/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Names for MatCUSPARSEStorageFormat in 0-based enum-value order, terminated per the
   PetscOptionsEnum() convention (enum type name, prefix, then a null sentinel) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
/* Index 0 is "INVALID" because cusparseCsr2CscAlg_t has no enum with value 0; we create a placeholder
   so positions in this array still line up with the cuSPARSE enum values for PetscOptionsEnum() */
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, solve, multiply, option and destroy
   routines implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record the given CUDA stream in A's GPU data and attach it to A's cuSPARSE handle,
   so subsequent cuSPARSE calls on A are issued on that stream */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a caller-provided cuSPARSE handle on A; any handle A previously owned is
   destroyed first.  The pointer mode is forced to DEVICE, as the SpMV code keeps
   its scalar coefficients in GPU memory */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (do NOT destroy) the cuSPARSE handle stored in A; used when the handle is
   owned by someone else, e.g. after MatCUSPARSESetHandle() */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name for factored matrices of this type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) results in poorer 132087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 133087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 134087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 135c708e6cdSJed Brown 1369ae82921SPaul Mullowney Level: beginner 137c708e6cdSJed Brown 1383ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 139c708e6cdSJed Brown M*/ 1409ae82921SPaul Mullowney 14142c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1429ae82921SPaul Mullowney { 1439ae82921SPaul Mullowney PetscErrorCode ierr; 144bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1459ae82921SPaul Mullowney 1469ae82921SPaul Mullowney PetscFunctionBegin; 147bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 148bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1492c7c0729SBarry Smith (*B)->factortype = ftype; 1502c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1519ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1522205254eSKarl Rupp 153087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 15433d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1559ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1569ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 157087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 158087f3262SPaul Mullowney 
(*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 159087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1609ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 161bc3f50f2SPaul Mullowney 162fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1633ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1649ae82921SPaul Mullowney PetscFunctionReturn(0); 1659ae82921SPaul Mullowney } 1669ae82921SPaul Mullowney 167bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 168ca45077fSPaul Mullowney { 169aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1706e111a19SKarl Rupp 171ca45077fSPaul Mullowney PetscFunctionBegin; 172ca45077fSPaul Mullowney switch (op) { 173e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 174aa372e3fSPaul Mullowney cusparsestruct->format = format; 175ca45077fSPaul Mullowney break; 176e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 177aa372e3fSPaul Mullowney cusparsestruct->format = format; 178ca45077fSPaul Mullowney break; 179ca45077fSPaul Mullowney default: 18036d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 181ca45077fSPaul Mullowney } 182ca45077fSPaul Mullowney PetscFunctionReturn(0); 183ca45077fSPaul Mullowney } 1849ae82921SPaul Mullowney 185e057df02SPaul Mullowney /*@ 186e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 187e057df02SPaul Mullowney operation. 
 Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if the type provides one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices that are not SEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process runtime options (-mat_cusparse_*) controlling storage format, explicit
   transpose generation, and (CUDA >= 11) the cuSPARSE algorithm choices */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegate to the host SeqAIJ implementation, then install the CUSPARSE numeric stage */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegate to the host SeqAIJ implementation, then install the CUSPARSE numeric stage */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: delegate to the host SeqAIJ implementation, then install the CUSPARSE numeric stage */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegate to the host SeqAIJ implementation, then install the CUSPARSE numeric stage */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Copy the unit lower-triangular factor L out of the host ILU factorization (stored
   in Mat_SeqAIJ) into a CSR matrix on the GPU and run the cuSPARSE triangular-solve
   analysis for it.  Only done when the GPU copy is absent or out of date. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n diagonal ones plus the strictly-lower entries; rows 1..n-1 hold them since row 0 of L is just the 1 */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for fast H2D copies) */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v       = aa;
      vi      = aj;
      offset  = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s copy the pinned host arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 interface requires an explicitly sized external work buffer */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the upper-triangular factor U of the host ILU factorization into a CSR matrix
   on the GPU and run the cuSPARSE triangular-solve analysis (mirror of the routine above) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 4709ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 4719ae82921SPaul Mullowney 4729ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 47357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 47457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 47557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 4769ae82921SPaul Mullowney 4779ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4789ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 4799ae82921SPaul Mullowney AiUp[n]=nzUpper; 4809ae82921SPaul Mullowney offset = nzUpper; 4819ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 4829ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 4839ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 4849ae82921SPaul Mullowney 485e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4869ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 4879ae82921SPaul Mullowney 488e057df02SPaul Mullowney /* decrement the offset */ 4899ae82921SPaul Mullowney offset -= (nz+1); 4909ae82921SPaul Mullowney 491e057df02SPaul Mullowney /* first, set the diagonal elements */ 4929ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 49309f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 4949ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 4959ae82921SPaul Mullowney 496580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 497580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 4989ae82921SPaul Mullowney } 4992205254eSKarl Rupp 500aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 501aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 5022205254eSKarl Rupp 503aa372e3fSPaul Mullowney /* Create 
the matrix description */ 50457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 50557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 506afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 507afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 508afb2bd1cSJunchao Zhang #else 50957d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 510afb2bd1cSJunchao Zhang #endif 51157d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 51257d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 513aa372e3fSPaul Mullowney 514aa372e3fSPaul Mullowney /* set the operation */ 515aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 516aa372e3fSPaul Mullowney 517aa372e3fSPaul Mullowney /* set the matrix */ 518aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 519aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 520aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 521aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 522aa372e3fSPaul Mullowney 523aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 524aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 525aa372e3fSPaul Mullowney 526aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 527aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 528aa372e3fSPaul Mullowney 529aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 530aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 
531aa372e3fSPaul Mullowney 532afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 533afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 534afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 535afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 536afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 537afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 538afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 539afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 540afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 541afb2bd1cSJunchao Zhang #endif 542afb2bd1cSJunchao Zhang 543aa372e3fSPaul Mullowney /* perform the solve analysis */ 544aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 545aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 546aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 547afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 548afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 549afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 550afb2bd1cSJunchao Zhang #endif 551afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 552aa372e3fSPaul Mullowney 553aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? 
*/ 554aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 5552205254eSKarl Rupp 55657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 55757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 55857d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 5594863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 5609ae82921SPaul Mullowney } catch(char *ex) { 5619ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5629ae82921SPaul Mullowney } 5639ae82921SPaul Mullowney } 5649ae82921SPaul Mullowney PetscFunctionReturn(0); 5659ae82921SPaul Mullowney } 5669ae82921SPaul Mullowney 567087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 5689ae82921SPaul Mullowney { 5699ae82921SPaul Mullowney PetscErrorCode ierr; 5709ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5719ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 5729ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 5739ae82921SPaul Mullowney PetscBool row_identity,col_identity; 5749ae82921SPaul Mullowney const PetscInt *r,*c; 5759ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5769ae82921SPaul Mullowney 5779ae82921SPaul Mullowney PetscFunctionBegin; 578ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 579087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 580087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 5812205254eSKarl Rupp 582e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 583aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 5849ae82921SPaul Mullowney 585c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 586e057df02SPaul Mullowney /* lower triangular 
indices */ 5879ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5889ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5892205254eSKarl Rupp if (!row_identity) { 590aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 591aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 5922205254eSKarl Rupp } 5939ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5949ae82921SPaul Mullowney 595e057df02SPaul Mullowney /* upper triangular indices */ 5969ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 5979ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5982205254eSKarl Rupp if (!col_identity) { 599aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 600aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6012205254eSKarl Rupp } 6024863603aSSatish Balay 6034863603aSSatish Balay if (!row_identity && !col_identity) { 6044863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 6054863603aSSatish Balay } else if(!row_identity) { 6064863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 6074863603aSSatish Balay } else if(!col_identity) { 6084863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 6094863603aSSatish Balay } 6104863603aSSatish Balay 6119ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 6129ae82921SPaul Mullowney PetscFunctionReturn(0); 6139ae82921SPaul Mullowney } 6149ae82921SPaul Mullowney 615087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 616087f3262SPaul Mullowney { 617087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 618087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 619aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 620aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 621087f3262SPaul Mullowney cusparseStatus_t stat; 622087f3262SPaul Mullowney PetscErrorCode ierr; 62357d48284SJunchao Zhang cudaError_t cerr; 624087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 625087f3262SPaul Mullowney PetscScalar *AAUp; 626087f3262SPaul Mullowney PetscScalar *AALo; 627087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 628087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 629087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 630087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 631087f3262SPaul Mullowney 632087f3262SPaul Mullowney PetscFunctionBegin; 633cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 634c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 635087f3262SPaul Mullowney try { 636087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 63757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 63857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 63957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 64057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 641087f3262SPaul Mullowney 642087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 643087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 644087f3262SPaul Mullowney AiUp[n]=nzUpper; 645087f3262SPaul Mullowney offset = 0; 646087f3262SPaul Mullowney for (i=0; i<n; i++) { 647087f3262SPaul Mullowney /* set the pointers */ 648087f3262SPaul Mullowney v = aa + ai[i]; 649087f3262SPaul Mullowney vj = aj + ai[i]; 
650087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 651087f3262SPaul Mullowney 652087f3262SPaul Mullowney /* first, set the diagonal elements */ 653087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 65409f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 655087f3262SPaul Mullowney AiUp[i] = offset; 65609f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 657087f3262SPaul Mullowney 658087f3262SPaul Mullowney offset+=1; 659087f3262SPaul Mullowney if (nz>0) { 660f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 661580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 662087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 663087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 664087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 665087f3262SPaul Mullowney } 666087f3262SPaul Mullowney offset+=nz; 667087f3262SPaul Mullowney } 668087f3262SPaul Mullowney } 669087f3262SPaul Mullowney 670aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 671aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 672087f3262SPaul Mullowney 673aa372e3fSPaul Mullowney /* Create the matrix description */ 67457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 67557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 676afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 677afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 678afb2bd1cSJunchao Zhang #else 67957d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 680afb2bd1cSJunchao Zhang #endif 68157d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 
68257d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 683087f3262SPaul Mullowney 684aa372e3fSPaul Mullowney /* set the matrix */ 685aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 686aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 687aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 688aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 689aa372e3fSPaul Mullowney 690aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 691aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 692aa372e3fSPaul Mullowney 693aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 694aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 695aa372e3fSPaul Mullowney 696aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 697aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 698aa372e3fSPaul Mullowney 699afb2bd1cSJunchao Zhang /* set the operation */ 700afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 701afb2bd1cSJunchao Zhang 702afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 703afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 704afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 705afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 706afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 707afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 708afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 709afb2bd1cSJunchao Zhang 
&upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 710afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 711afb2bd1cSJunchao Zhang #endif 712afb2bd1cSJunchao Zhang 713aa372e3fSPaul Mullowney /* perform the solve analysis */ 714aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 715aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 716aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 717afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 718afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 719afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 720afb2bd1cSJunchao Zhang #endif 721afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 722aa372e3fSPaul Mullowney 723aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? 
*/ 724aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 725aa372e3fSPaul Mullowney 726aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 727aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 728aa372e3fSPaul Mullowney 729aa372e3fSPaul Mullowney /* Create the matrix description */ 73057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 73157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 732afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 733afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 734afb2bd1cSJunchao Zhang #else 73557d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 736afb2bd1cSJunchao Zhang #endif 73757d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 73857d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 739aa372e3fSPaul Mullowney 740aa372e3fSPaul Mullowney /* set the operation */ 741aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 742aa372e3fSPaul Mullowney 743aa372e3fSPaul Mullowney /* set the matrix */ 744aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 745aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 746aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 747aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 748aa372e3fSPaul Mullowney 749aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 750aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 751aa372e3fSPaul Mullowney 
752aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 753aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 754aa372e3fSPaul Mullowney 755aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 756aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 7574863603aSSatish Balay ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 758aa372e3fSPaul Mullowney 759afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 760afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 761afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 762afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 763afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 764afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 765afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 766afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 767afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 768afb2bd1cSJunchao Zhang #endif 769afb2bd1cSJunchao Zhang 770aa372e3fSPaul Mullowney /* perform the solve analysis */ 771aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 772aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 773aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 774afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 775afb2bd1cSJunchao Zhang #if 
PETSC_PKG_CUDA_VERSION_GE(11,0,0) 776afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 777afb2bd1cSJunchao Zhang #endif 778afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 779aa372e3fSPaul Mullowney 780aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 781aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 782087f3262SPaul Mullowney 783c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 78457d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 78557d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 78657d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 78757d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 788087f3262SPaul Mullowney } catch(char *ex) { 789087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 790087f3262SPaul Mullowney } 791087f3262SPaul Mullowney } 792087f3262SPaul Mullowney PetscFunctionReturn(0); 793087f3262SPaul Mullowney } 794087f3262SPaul Mullowney 795087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 7969ae82921SPaul Mullowney { 7979ae82921SPaul Mullowney PetscErrorCode ierr; 798087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 799087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 800087f3262SPaul Mullowney IS ip = a->row; 801087f3262SPaul Mullowney const PetscInt *rip; 802087f3262SPaul Mullowney PetscBool perm_identity; 803087f3262SPaul Mullowney PetscInt n = A->rmap->n; 804087f3262SPaul Mullowney 805087f3262SPaul Mullowney PetscFunctionBegin; 806ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 807087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 808e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 809aa372e3fSPaul Mullowney 
cusparseTriFactors->nnz=(a->nz-n)*2 + n; 810aa372e3fSPaul Mullowney 811087f3262SPaul Mullowney /* lower triangular indices */ 812087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 813087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 814087f3262SPaul Mullowney if (!perm_identity) { 8154e4bbfaaSStefano Zampini IS iip; 8164e4bbfaaSStefano Zampini const PetscInt *irip; 8174e4bbfaaSStefano Zampini 8184e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 8194e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 820aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 821aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 822aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8234e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 8244e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 8254e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 8264863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 827087f3262SPaul Mullowney } 828087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 829087f3262SPaul Mullowney PetscFunctionReturn(0); 830087f3262SPaul Mullowney } 831087f3262SPaul Mullowney 8326fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 8339ae82921SPaul Mullowney { 8349ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 8359ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 8369ae82921SPaul Mullowney PetscBool row_identity,col_identity; 837b175d8bbSPaul Mullowney PetscErrorCode ierr; 8389ae82921SPaul Mullowney 8399ae82921SPaul Mullowney PetscFunctionBegin; 8409ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 841ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 
842e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 8439ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 8449ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 845bda325fcSPaul Mullowney if (row_identity && col_identity) { 846bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 847bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8484e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8494e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 850bda325fcSPaul Mullowney } else { 851bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 852bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8534e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8544e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 855bda325fcSPaul Mullowney } 8568dc1d2a3SPaul Mullowney 857e057df02SPaul Mullowney /* get the triangular factors */ 858087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 8599ae82921SPaul Mullowney PetscFunctionReturn(0); 8609ae82921SPaul Mullowney } 8619ae82921SPaul Mullowney 862087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 863087f3262SPaul Mullowney { 864087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 865087f3262SPaul Mullowney IS ip = b->row; 866087f3262SPaul Mullowney PetscBool perm_identity; 867b175d8bbSPaul Mullowney PetscErrorCode ierr; 868087f3262SPaul Mullowney 869087f3262SPaul Mullowney PetscFunctionBegin; 870087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 871ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 872087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 873087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 874087f3262SPaul Mullowney if (perm_identity) { 875087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 876087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8774e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8784e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 879087f3262SPaul Mullowney } else { 880087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 881087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8824e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8834e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 884087f3262SPaul Mullowney } 885087f3262SPaul Mullowney 886087f3262SPaul Mullowney /* get the triangular factors */ 887087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 888087f3262SPaul Mullowney PetscFunctionReturn(0); 889087f3262SPaul Mullowney } 8909ae82921SPaul Mullowney 891b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 892bda325fcSPaul Mullowney { 893bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 894aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 895aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 896aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 897aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 898bda325fcSPaul Mullowney cusparseStatus_t stat; 899aa372e3fSPaul Mullowney 
cusparseIndexBase_t indexBase; 900aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 901aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 902aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 903b175d8bbSPaul Mullowney 904bda325fcSPaul Mullowney PetscFunctionBegin; 905bda325fcSPaul Mullowney 906aa372e3fSPaul Mullowney /*********************************************/ 907aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 908aa372e3fSPaul Mullowney /*********************************************/ 909aa372e3fSPaul Mullowney 910aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 911aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 912aa372e3fSPaul Mullowney 913aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 914aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 915aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 916aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
917aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 918aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 919aa372e3fSPaul Mullowney 920aa372e3fSPaul Mullowney /* Create the matrix description */ 92157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 92257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 92357d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 92457d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 92557d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 926aa372e3fSPaul Mullowney 927aa372e3fSPaul Mullowney /* set the operation */ 928aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 929aa372e3fSPaul Mullowney 930aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 931aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 932afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 933afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 934aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 935afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 936afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 937afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 938aa372e3fSPaul Mullowney 939aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 940afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 941afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 942afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 943afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 944afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 945afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 946afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 947afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 948afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 949afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 950afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 951afb2bd1cSJunchao Zhang #endif 952afb2bd1cSJunchao Zhang 953aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 954aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 955aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 956aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 957aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 958aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 959afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 960afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 961afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 962afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 963afb2bd1cSJunchao Zhang #else 964afb2bd1cSJunchao Zhang 
loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 965afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 966afb2bd1cSJunchao Zhang #endif 967afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 968aa372e3fSPaul Mullowney 969afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 970afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 971afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 972afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 973afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 974afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 975afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 976afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 977afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 978afb2bd1cSJunchao Zhang #endif 979afb2bd1cSJunchao Zhang 980afb2bd1cSJunchao Zhang /* perform the solve analysis */ 981aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 982afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 983afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 984afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 985afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 986afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 987afb2bd1cSJunchao Zhang #endif 988afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 989aa372e3fSPaul Mullowney 990aa372e3fSPaul Mullowney 
/* assign the pointer. Is this really necessary? */ 991aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 992aa372e3fSPaul Mullowney 993aa372e3fSPaul Mullowney /*********************************************/ 994aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 995aa372e3fSPaul Mullowney /*********************************************/ 996aa372e3fSPaul Mullowney 997aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 998aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 999aa372e3fSPaul Mullowney 1000aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1001aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1002aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1003aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1004aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1005aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1006aa372e3fSPaul Mullowney 1007aa372e3fSPaul Mullowney /* Create the matrix description */ 100857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 100957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 101057d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 101157d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 101257d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1013aa372e3fSPaul Mullowney 1014aa372e3fSPaul Mullowney /* set the operation */ 1015aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1016aa372e3fSPaul Mullowney 1017aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1018aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1019afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1020afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1021aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1022afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1023afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1024afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1025aa372e3fSPaul Mullowney 1026aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1027afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1028afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1029afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1030afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1031afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1032afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1033afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1034afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1035afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1036afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1037afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1038afb2bd1cSJunchao Zhang #endif 1039afb2bd1cSJunchao Zhang 1040aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1041aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1042aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1043aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1044aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1045aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1046afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1047afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1048afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1049afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1050afb2bd1cSJunchao Zhang #else 1051afb2bd1cSJunchao Zhang 
upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1052afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1053afb2bd1cSJunchao Zhang #endif 1054afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1055aa372e3fSPaul Mullowney 1056afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1057afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1058afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1059afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1060afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1061afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1062afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1063afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1064afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1065afb2bd1cSJunchao Zhang #endif 1066afb2bd1cSJunchao Zhang 1067afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1068aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1069afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1070afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1071afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1072afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1073afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1074afb2bd1cSJunchao Zhang #endif 1075afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1076aa372e3fSPaul Mullowney 
1077aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 1078aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1079bda325fcSPaul Mullowney PetscFunctionReturn(0); 1080bda325fcSPaul Mullowney } 1081bda325fcSPaul Mullowney 1082b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1083bda325fcSPaul Mullowney { 1084aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1085aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1086aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1087bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1088bda325fcSPaul Mullowney cusparseStatus_t stat; 1089aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1090b06137fdSPaul Mullowney cudaError_t err; 109185ba7357SStefano Zampini PetscErrorCode ierr; 1092b175d8bbSPaul Mullowney 1093bda325fcSPaul Mullowney PetscFunctionBegin; 109485ba7357SStefano Zampini if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 109585ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 109685ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 109785ba7357SStefano Zampini /* create cusparse matrix */ 1098aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 109957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1100aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 110157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 110257d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1103aa372e3fSPaul 
Mullowney 1104b06137fdSPaul Mullowney /* set alpha and beta */ 1105afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err); 11067656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 11077656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1108afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 11097656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 11107656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 111157d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1112b06137fdSPaul Mullowney 1113aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1114aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1115aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 1116554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1117554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1118aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1119a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1120aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1121aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1122a3fdcf43SKarl Rupp 112381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 112481902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1125afb2bd1cSJunchao Zhang 112681902715SJunchao Zhang /* compute the transpose, i.e. 
the CSC */ 1127afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1128afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1129afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1130afb2bd1cSJunchao Zhang matrix->values->data().get(), 1131afb2bd1cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1132afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1133afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1134afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1135afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1136afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1137afb2bd1cSJunchao Zhang err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1138afb2bd1cSJunchao Zhang #endif 1139afb2bd1cSJunchao Zhang 1140a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1141a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 1142aa372e3fSPaul Mullowney matrix->values->data().get(), 114381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1144aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1145aa372e3fSPaul Mullowney matrixT->values->data().get(), 1146afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1147afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1148afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1149afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1150afb2bd1cSJunchao Zhang #else 1151afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1152afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1153afb2bd1cSJunchao Zhang #endif 1154afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 
1155aa372e3fSPaul Mullowney matstructT->mat = matrixT; 1156afb2bd1cSJunchao Zhang 1157afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1158afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1159afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1160afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1161afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1162afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1163afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1164afb2bd1cSJunchao Zhang #endif 1165aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1166afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1167afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1168afb2bd1cSJunchao Zhang #else 1169aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 117051c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 117151c6d536SStefano Zampini /* First convert HYB to CSR */ 1172aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1173aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1174aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1175aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1176aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1177aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1178aa372e3fSPaul Mullowney 1179aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1180aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1181aa372e3fSPaul Mullowney temp->values->data().get(), 1182aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 118357d48284SJunchao Zhang 
temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1184aa372e3fSPaul Mullowney 1185aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1186aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1187aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1188aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1189aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1190aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1191aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1192aa372e3fSPaul Mullowney 1193aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1194aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1195aa372e3fSPaul Mullowney temp->values->data().get(), 1196aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1197aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1198aa372e3fSPaul Mullowney tempT->values->data().get(), 1199aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1200aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 120157d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1202aa372e3fSPaul Mullowney 1203aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1204aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 120557d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1206aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1207aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1208aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1209aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1210aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1211aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 121257d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1213aa372e3fSPaul Mullowney 1214aa372e3fSPaul Mullowney /* assign the pointer */ 1215aa372e3fSPaul Mullowney matstructT->mat = hybMat; 1216aa372e3fSPaul Mullowney /* delete temporaries */ 1217aa372e3fSPaul Mullowney if (tempT) { 1218aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1219aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1220aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1221aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1222087f3262SPaul Mullowney } 1223aa372e3fSPaul Mullowney if (temp) { 1224aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1225aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1226aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1227aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1228aa372e3fSPaul Mullowney } 1229afb2bd1cSJunchao Zhang #endif 1230aa372e3fSPaul Mullowney } 123105035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 123285ba7357SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 123385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1234213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1235213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1236aa372e3fSPaul Mullowney /* assign the pointer */ 1237aa372e3fSPaul 
Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1238bda325fcSPaul Mullowney PetscFunctionReturn(0); 1239bda325fcSPaul Mullowney } 1240bda325fcSPaul Mullowney 12414e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 12426fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1243bda325fcSPaul Mullowney { 1244c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1245465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1246465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1247465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1248465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1249bda325fcSPaul Mullowney cusparseStatus_t stat; 1250bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1251aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1252aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1253aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1254b175d8bbSPaul Mullowney PetscErrorCode ierr; 125557d48284SJunchao Zhang cudaError_t cerr; 1256bda325fcSPaul Mullowney 1257bda325fcSPaul Mullowney PetscFunctionBegin; 1258aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1259aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1260bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1261aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1262aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1263bda325fcSPaul Mullowney } 1264bda325fcSPaul Mullowney 1265bda325fcSPaul Mullowney /* Get the GPU pointers */ 1266c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1267c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1268c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1269c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1270bda325fcSPaul Mullowney 12717a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1272aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1273c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1274c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1275c41cb2e2SAlejandro Lamas Daviña xGPU); 1276aa372e3fSPaul Mullowney 1277aa372e3fSPaul Mullowney /* First, solve U */ 1278aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1279afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 1280afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1281afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1282afb2bd1cSJunchao Zhang #endif 1283afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1284aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1285aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1286aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->column_indices->data().get(), 1287aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1288afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 1289afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1290afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1291afb2bd1cSJunchao Zhang #endif 1292afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1293aa372e3fSPaul Mullowney 1294aa372e3fSPaul Mullowney /* Then, solve L */ 1295aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1296afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 1297afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1298afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1299afb2bd1cSJunchao Zhang #endif 1300afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1301aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1302aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1303aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1304aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1305afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1306afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1307afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1308afb2bd1cSJunchao Zhang #endif 1309afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1310aa372e3fSPaul Mullowney 1311aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1312c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1313c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1314aa372e3fSPaul Mullowney tempGPU->begin()); 1315aa372e3fSPaul Mullowney 1316aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. 
*/ 1317c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1318bda325fcSPaul Mullowney 1319bda325fcSPaul Mullowney /* restore */ 1320c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1321c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 132205035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1323661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1324958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1325bda325fcSPaul Mullowney PetscFunctionReturn(0); 1326bda325fcSPaul Mullowney } 1327bda325fcSPaul Mullowney 13286fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1329bda325fcSPaul Mullowney { 1330465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1331465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1332bda325fcSPaul Mullowney cusparseStatus_t stat; 1333bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1334aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1335aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1336aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1337b175d8bbSPaul Mullowney PetscErrorCode ierr; 133857d48284SJunchao Zhang cudaError_t cerr; 1339bda325fcSPaul Mullowney 1340bda325fcSPaul Mullowney PetscFunctionBegin; 1341aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1342aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1343bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1344aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1345aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1346bda325fcSPaul Mullowney } 1347bda325fcSPaul Mullowney 1348bda325fcSPaul Mullowney /* Get the GPU pointers */ 1349c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1350c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1351bda325fcSPaul Mullowney 13527a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1353aa372e3fSPaul Mullowney /* First, solve U */ 1354aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 1356afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1357afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1358afb2bd1cSJunchao Zhang #endif 1359afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1361aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1362aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1363aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1364afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 1365afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1366afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1367afb2bd1cSJunchao Zhang #endif 1368afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1369aa372e3fSPaul Mullowney 1370aa372e3fSPaul Mullowney /* Then, solve L */ 1371aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, 
loTriFactorT->solveOp, 1372afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 1373afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1374afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1375afb2bd1cSJunchao Zhang #endif 1376afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1377aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1378aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1379aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1380aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1381afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1382afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1384afb2bd1cSJunchao Zhang #endif 1385afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1386bda325fcSPaul Mullowney 1387bda325fcSPaul Mullowney /* restore */ 1388c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1389c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 139005035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1391661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1392958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1393bda325fcSPaul Mullowney PetscFunctionReturn(0); 1394bda325fcSPaul Mullowney } 1395bda325fcSPaul Mullowney 13966fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 13979ae82921SPaul Mullowney { 1398465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1399465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1400465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1401465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 14029ae82921SPaul Mullowney cusparseStatus_t stat; 14039ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors 
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1404aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1405aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1406aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1407b175d8bbSPaul Mullowney PetscErrorCode ierr; 140857d48284SJunchao Zhang cudaError_t cerr; 14099ae82921SPaul Mullowney 14109ae82921SPaul Mullowney PetscFunctionBegin; 1411ebc8f436SDominic Meiser 1412e057df02SPaul Mullowney /* Get the GPU pointers */ 1413c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1414c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1415c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1416c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 14179ae82921SPaul Mullowney 14187a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1419aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1420c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1421c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 14224e4bbfaaSStefano Zampini tempGPU->begin()); 1423aa372e3fSPaul Mullowney 1424aa372e3fSPaul Mullowney /* Next, solve L */ 1425aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1426afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 1427afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1428afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1429afb2bd1cSJunchao Zhang #endif 1430afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 
1431aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1432aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1433aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1434aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1435afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1436afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1437afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1438afb2bd1cSJunchao Zhang #endif 1439afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1440aa372e3fSPaul Mullowney 1441aa372e3fSPaul Mullowney /* Then, solve U */ 1442aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1443afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 1444afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1445afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1446afb2bd1cSJunchao Zhang #endif 1447afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1448aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1449aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1450aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1451aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1452afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 1453afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1454afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1455afb2bd1cSJunchao Zhang #endif 1456afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1457aa372e3fSPaul Mullowney 14584e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 14594e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 14604e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 14614e4bbfaaSStefano Zampini xGPU); 
14629ae82921SPaul Mullowney 1463c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1464c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 146505035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1466661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1467958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 14689ae82921SPaul Mullowney PetscFunctionReturn(0); 14699ae82921SPaul Mullowney } 14709ae82921SPaul Mullowney 14716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 14729ae82921SPaul Mullowney { 1473465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1474465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 14759ae82921SPaul Mullowney cusparseStatus_t stat; 14769ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1477aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1478aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1479aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1480b175d8bbSPaul Mullowney PetscErrorCode ierr; 148157d48284SJunchao Zhang cudaError_t cerr; 14829ae82921SPaul Mullowney 14839ae82921SPaul Mullowney PetscFunctionBegin; 1484e057df02SPaul Mullowney /* Get the GPU pointers */ 1485c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1486c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 14879ae82921SPaul Mullowney 14887a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1489aa372e3fSPaul Mullowney /* First, solve L */ 1490aa372e3fSPaul Mullowney stat = 
cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1491afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 1492afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1493afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1494afb2bd1cSJunchao Zhang #endif 1495afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1496aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1497aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1498aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1499aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1500afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 1501afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1502afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1503afb2bd1cSJunchao Zhang #endif 1504afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1505aa372e3fSPaul Mullowney 1506aa372e3fSPaul Mullowney /* Next, solve U */ 1507aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1508afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 1509afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1510afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1511afb2bd1cSJunchao Zhang #endif 1512afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1513aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1514aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1515aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1516aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1517afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1518afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1519afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1520afb2bd1cSJunchao Zhang #endif 1521afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 15229ae82921SPaul Mullowney 1523c41cb2e2SAlejandro Lamas 
Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1524c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 152505035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1526661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1527958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 15289ae82921SPaul Mullowney PetscFunctionReturn(0); 15299ae82921SPaul Mullowney } 15309ae82921SPaul Mullowney 15316fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 15329ae82921SPaul Mullowney { 1533aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 15347c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 15359ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1536213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 15379ae82921SPaul Mullowney PetscErrorCode ierr; 1538aa372e3fSPaul Mullowney cusparseStatus_t stat; 1539b06137fdSPaul Mullowney cudaError_t err; 15409ae82921SPaul Mullowney 15419ae82921SPaul Mullowney PetscFunctionBegin; 154295639643SRichard Tran Mills if (A->boundtocpu) PetscFunctionReturn(0); 1543c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 154481902715SJunchao Zhang if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 154581902715SJunchao Zhang /* Copy values only */ 1546afb2bd1cSJunchao Zhang CsrMatrix *matrix,*matrixT; 1547afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 154885ba7357SStefano Zampini 154985ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1550afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 155105035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 15524863603aSSatish Balay ierr = 
PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 155385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 155481902715SJunchao Zhang 155581902715SJunchao Zhang /* Update matT when it was built before */ 155681902715SJunchao Zhang if (cusparsestruct->matTranspose) { 155781902715SJunchao Zhang cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1558afb2bd1cSJunchao Zhang matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 155985ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 156081902715SJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1561afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1562afb2bd1cSJunchao Zhang matrix->values->data().get(), 156381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1564afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1565afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1566afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1567afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1568afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1569afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1570afb2bd1cSJunchao Zhang #else 1571afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1572afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1573afb2bd1cSJunchao Zhang #endif 1574afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 157505035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 157685ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 157781902715SJunchao Zhang } 157834d6c7a5SJose E. 
Roman } else { 157985ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 15807c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 15817c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 15827c700b8dSJunchao Zhang delete cusparsestruct->workVector; 158381902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 15849ae82921SPaul Mullowney try { 15859ae82921SPaul Mullowney if (a->compressedrow.use) { 15869ae82921SPaul Mullowney m = a->compressedrow.nrows; 15879ae82921SPaul Mullowney ii = a->compressedrow.i; 15889ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 15899ae82921SPaul Mullowney } else { 1590213423ffSJunchao Zhang m = A->rmap->n; 1591213423ffSJunchao Zhang ii = a->i; 1592e6e9a74fSStefano Zampini ridx = NULL; 15939ae82921SPaul Mullowney } 1594213423ffSJunchao Zhang cusparsestruct->nrows = m; 15959ae82921SPaul Mullowney 159685ba7357SStefano Zampini /* create cusparse matrix */ 1597aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 159857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 159957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 160057d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 16019ae82921SPaul Mullowney 1602afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err); 16037656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 16047656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1605afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 16067656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 16077656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 160857d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1609b06137fdSPaul Mullowney 1610aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1611aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1612aa372e3fSPaul Mullowney /* set the matrix */ 1613afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1614afb2bd1cSJunchao Zhang mat->num_rows = m; 1615afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1616afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1617afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1618afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 16199ae82921SPaul Mullowney 1620afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1621afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1622aa372e3fSPaul Mullowney 1623afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1624afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1625aa372e3fSPaul Mullowney 1626aa372e3fSPaul Mullowney /* assign the pointer */ 1627afb2bd1cSJunchao Zhang matstruct->mat = mat; 1628afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1629afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1630afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1631afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1632afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1633afb2bd1cSJunchao Zhang mat->values->data().get(), 1634afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1635afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1636afb2bd1cSJunchao Zhang } 1637afb2bd1cSJunchao Zhang #endif 1638aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1639afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1640afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1641afb2bd1cSJunchao Zhang #else 1642afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1643afb2bd1cSJunchao Zhang mat->num_rows = m; 1644afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1645afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1646afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1647afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1648aa372e3fSPaul Mullowney 1649afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1650afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1651aa372e3fSPaul Mullowney 1652afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1653afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1654aa372e3fSPaul Mullowney 1655aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 165657d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1657aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1658aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1659afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1660afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1661afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1662afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 166357d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1664aa372e3fSPaul Mullowney /* assign the pointer */ 1665aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1666aa372e3fSPaul Mullowney 1667afb2bd1cSJunchao Zhang if (mat) { 1668afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1669afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1670afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1671afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1672087f3262SPaul Mullowney } 1673afb2bd1cSJunchao Zhang #endif 1674087f3262SPaul Mullowney } 1675ca45077fSPaul Mullowney 1676aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1677213423ffSJunchao Zhang if (a->compressedrow.use) { 1678213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1679aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1680aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1681213423ffSJunchao Zhang tmp = m; 1682213423ffSJunchao Zhang } else { 1683213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1684213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1685213423ffSJunchao Zhang tmp = 0; 1686213423ffSJunchao Zhang } 1687213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1688aa372e3fSPaul Mullowney 1689aa372e3fSPaul Mullowney /* assign the pointer */ 1690aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 16919ae82921SPaul Mullowney } 
catch(char *ex) { 16929ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 16939ae82921SPaul Mullowney } 169405035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 169585ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 169634d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 169734d6c7a5SJose E. Roman } 1698c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 16999ae82921SPaul Mullowney } 17009ae82921SPaul Mullowney PetscFunctionReturn(0); 17019ae82921SPaul Mullowney } 17029ae82921SPaul Mullowney 1703c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1704aa372e3fSPaul Mullowney { 1705aa372e3fSPaul Mullowney template <typename Tuple> 1706aa372e3fSPaul Mullowney __host__ __device__ 1707aa372e3fSPaul Mullowney void operator()(Tuple t) 1708aa372e3fSPaul Mullowney { 1709aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1710aa372e3fSPaul Mullowney } 1711aa372e3fSPaul Mullowney }; 1712aa372e3fSPaul Mullowney 1713e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1714e6e9a74fSStefano Zampini { 1715e6e9a74fSStefano Zampini template <typename Tuple> 1716e6e9a74fSStefano Zampini __host__ __device__ 1717e6e9a74fSStefano Zampini void operator()(Tuple t) 1718e6e9a74fSStefano Zampini { 1719e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1720e6e9a74fSStefano Zampini } 1721e6e9a74fSStefano Zampini }; 1722e6e9a74fSStefano Zampini 1723afb2bd1cSJunchao Zhang struct MatMatCusparse { 1724ccdfe979SStefano Zampini PetscBool cisdense; 1725ccdfe979SStefano Zampini PetscScalar *Bt; 1726ccdfe979SStefano Zampini Mat X; 1727afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1728afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1729afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1730afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1731afb2bd1cSJunchao Zhang size_t 
spmmBufferSize; 1732afb2bd1cSJunchao Zhang void *spmmBuffer; 1733afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1734afb2bd1cSJunchao Zhang #endif 1735afb2bd1cSJunchao Zhang }; 1736ccdfe979SStefano Zampini 1737ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1738ccdfe979SStefano Zampini { 1739ccdfe979SStefano Zampini PetscErrorCode ierr; 1740ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1741ccdfe979SStefano Zampini cudaError_t cerr; 1742ccdfe979SStefano Zampini 1743ccdfe979SStefano Zampini PetscFunctionBegin; 1744ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1745afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1746afb2bd1cSJunchao Zhang cusparseStatus_t stat; 1747afb2bd1cSJunchao Zhang if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1748afb2bd1cSJunchao Zhang if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1749afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1750afb2bd1cSJunchao Zhang #endif 1751ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1752ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1753ccdfe979SStefano Zampini PetscFunctionReturn(0); 1754ccdfe979SStefano Zampini } 1755ccdfe979SStefano Zampini 1756ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1757ccdfe979SStefano Zampini 1758ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1759ccdfe979SStefano Zampini { 1760ccdfe979SStefano Zampini Mat_Product *product = C->product; 1761ccdfe979SStefano Zampini Mat A,B; 1762afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1763ccdfe979SStefano Zampini PetscBool flg,biscuda; 1764ccdfe979SStefano 
Zampini Mat_SeqAIJCUSPARSE *cusp; 1765ccdfe979SStefano Zampini cusparseStatus_t stat; 1766ccdfe979SStefano Zampini cusparseOperation_t opA; 1767ccdfe979SStefano Zampini const PetscScalar *barray; 1768ccdfe979SStefano Zampini PetscScalar *carray; 1769ccdfe979SStefano Zampini PetscErrorCode ierr; 1770ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1771ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1772ccdfe979SStefano Zampini CsrMatrix *csrmat; 1773afb2bd1cSJunchao Zhang cudaError_t cerr; 1774ccdfe979SStefano Zampini 1775ccdfe979SStefano Zampini PetscFunctionBegin; 1776ccdfe979SStefano Zampini MatCheckProduct(C,1); 1777ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1778ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1779ccdfe979SStefano Zampini A = product->A; 1780ccdfe979SStefano Zampini B = product->B; 1781ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1782ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1783ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1784ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1785ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1786ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1787ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1788ccdfe979SStefano Zampini switch (product->type) { 1789ccdfe979SStefano Zampini case MATPRODUCT_AB: 1790ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1791ccdfe979SStefano Zampini mat = cusp->mat; 1792ccdfe979SStefano Zampini opA = 
CUSPARSE_OPERATION_NON_TRANSPOSE; 1793ccdfe979SStefano Zampini m = A->rmap->n; 1794ccdfe979SStefano Zampini n = B->cmap->n; 1795ccdfe979SStefano Zampini break; 1796ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1797e6e9a74fSStefano Zampini if (!cusp->transgen) { 1798e6e9a74fSStefano Zampini mat = cusp->mat; 1799e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1800e6e9a74fSStefano Zampini } else { 1801ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1802ccdfe979SStefano Zampini mat = cusp->matTranspose; 1803ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1804e6e9a74fSStefano Zampini } 1805ccdfe979SStefano Zampini m = A->cmap->n; 1806ccdfe979SStefano Zampini n = B->cmap->n; 1807ccdfe979SStefano Zampini break; 1808ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1809ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1810ccdfe979SStefano Zampini mat = cusp->mat; 1811ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1812ccdfe979SStefano Zampini m = A->rmap->n; 1813ccdfe979SStefano Zampini n = B->rmap->n; 1814ccdfe979SStefano Zampini break; 1815ccdfe979SStefano Zampini default: 1816ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1817ccdfe979SStefano Zampini } 1818ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1819ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 1820ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 1821ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1822afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1823ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1824afb2bd1cSJunchao Zhang 
1825ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1826c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1827c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1828c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1829c8378d12SStefano Zampini } else { 1830c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1831c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1832c8378d12SStefano Zampini } 1833c8378d12SStefano Zampini 1834c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1835afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1836afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1837afb2bd1cSJunchao Zhang /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1838afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1839afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1840afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 1841afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1842afb2bd1cSJunchao Zhang mmdata->Blda = blda; 1843afb2bd1cSJunchao Zhang } 1844c8378d12SStefano Zampini 1845afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1846afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1847afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1848afb2bd1cSJunchao Zhang mmdata->Clda = clda; 1849afb2bd1cSJunchao Zhang } 1850afb2bd1cSJunchao Zhang 1851afb2bd1cSJunchao Zhang if (!mat->matDescr) { 1852afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 1853afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1854afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 1855afb2bd1cSJunchao Zhang csrmat->values->data().get(), 1856afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1857afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1858afb2bd1cSJunchao Zhang } 1859afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 1860afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1861afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 1862afb2bd1cSJunchao Zhang cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 1863afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1864afb2bd1cSJunchao Zhang cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 1865afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 1866afb2bd1cSJunchao Zhang } else { 1867afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 1868afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 1869afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 1870afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 1871afb2bd1cSJunchao Zhang } 1872afb2bd1cSJunchao Zhang 1873afb2bd1cSJunchao Zhang /* do cusparseSpMM, 
which supports transpose on B */ 1874afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 1875afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1876afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 1877afb2bd1cSJunchao Zhang cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 1878afb2bd1cSJunchao Zhang #else 1879afb2bd1cSJunchao Zhang PetscInt k; 1880afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 1881ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1882ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 1883ccdfe979SStefano Zampini cublasStatus_t cerr; 1884ccdfe979SStefano Zampini 1885ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1886ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1887ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 1888ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 1889ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 1890ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1891ccdfe979SStefano Zampini blda = B->cmap->n; 1892afb2bd1cSJunchao Zhang k = B->cmap->n; 1893afb2bd1cSJunchao Zhang } else { 1894afb2bd1cSJunchao Zhang k = B->rmap->n; 1895ccdfe979SStefano Zampini } 1896ccdfe979SStefano Zampini 1897afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 1898ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1899afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 1900ccdfe979SStefano Zampini csrmat->values->data().get(), 1901ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 1902ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 1903ccdfe979SStefano Zampini mmdata->Bt ? 
                               mmdata->Bt : barray,blda,mat->beta_zero,
                               carray,clda);CHKERRCUSPARSE(stat);
#endif
    /* NOTE(review): the lines above are the tail of MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA,
       whose head lies outside this chunk; left untouched. */
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
    ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* For RARt/PtAP the sparse product only computed the intermediate X = A*R^T (resp. A*P);
     finish with a dense-dense product to obtain C = R*X (resp. P^T*X) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* caller asked for a host (MATSEQDENSE) result: convert the CUDA result back in place */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  /* B was temporarily converted to MATSEQDENSECUDA earlier; restore its original type */
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for C = op(A,B) with A sparse (SEQAIJCUSPARSE) and B dense:
   validates the operands, sets the sizes/type of C, and allocates the
   MatMatCusparse product data (including intermediate storage for RARt/PtAP).
   The numeric phase is installed at the end as C->ops->productnumeric. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1991ccdfe979SStefano Zampini } else { 1992ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 1993ccdfe979SStefano Zampini } 1994ccdfe979SStefano Zampini } 1995ccdfe979SStefano Zampini C->product->data = mmdata; 1996ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 1997ccdfe979SStefano Zampini 1998ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 1999ccdfe979SStefano Zampini PetscFunctionReturn(0); 2000ccdfe979SStefano Zampini } 2001ccdfe979SStefano Zampini 2002ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2003ccdfe979SStefano Zampini 2004ccdfe979SStefano Zampini /* handles dense B */ 2005ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2006ccdfe979SStefano Zampini { 2007ccdfe979SStefano Zampini Mat_Product *product = C->product; 2008ccdfe979SStefano Zampini PetscErrorCode ierr; 2009ccdfe979SStefano Zampini 2010ccdfe979SStefano Zampini PetscFunctionBegin; 2011ccdfe979SStefano Zampini MatCheckProduct(C,1); 2012ccdfe979SStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2013ccdfe979SStefano Zampini if (product->A->boundtocpu) { 2014ccdfe979SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2015ccdfe979SStefano Zampini PetscFunctionReturn(0); 2016ccdfe979SStefano Zampini } 2017ccdfe979SStefano Zampini switch (product->type) { 2018ccdfe979SStefano Zampini case MATPRODUCT_AB: 2019ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2020ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2021ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2022ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2023ccdfe979SStefano Zampini C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2024ccdfe979SStefano Zampini default: 2025ccdfe979SStefano Zampini break; 2026ccdfe979SStefano Zampini 
} 2027ccdfe979SStefano Zampini PetscFunctionReturn(0); 2028ccdfe979SStefano Zampini } 2029ccdfe979SStefano Zampini 20306fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 20319ae82921SPaul Mullowney { 2032b175d8bbSPaul Mullowney PetscErrorCode ierr; 20339ae82921SPaul Mullowney 20349ae82921SPaul Mullowney PetscFunctionBegin; 2035e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2036e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2037e6e9a74fSStefano Zampini } 2038e6e9a74fSStefano Zampini 2039e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2040e6e9a74fSStefano Zampini { 2041e6e9a74fSStefano Zampini PetscErrorCode ierr; 2042e6e9a74fSStefano Zampini 2043e6e9a74fSStefano Zampini PetscFunctionBegin; 2044e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2045e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2046e6e9a74fSStefano Zampini } 2047e6e9a74fSStefano Zampini 2048e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2049e6e9a74fSStefano Zampini { 2050e6e9a74fSStefano Zampini PetscErrorCode ierr; 2051e6e9a74fSStefano Zampini 2052e6e9a74fSStefano Zampini PetscFunctionBegin; 2053e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2054e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2055e6e9a74fSStefano Zampini } 2056e6e9a74fSStefano Zampini 2057e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2058e6e9a74fSStefano Zampini { 2059e6e9a74fSStefano Zampini PetscErrorCode ierr; 2060e6e9a74fSStefano Zampini 2061e6e9a74fSStefano Zampini PetscFunctionBegin; 2062e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 
  PetscFunctionReturn(0);
}

/* yy = A^T * xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared worker behind all SeqAIJCUSPARSE Mult/MultAdd/MultTranspose variants.
   yy may be NULL (plain mult) or equal to zz (in-place add).  When A stores
   compressed rows (zero rows dropped), a work vector holds the short product
   (or the short input for the transpose case) and a thrust scatter/gather maps
   it to/from the full-length vectors. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* logical lengths of x and y for y = op(A) x; set only for the CSR format, which is the only one used below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* empty matrix: the product is zero (or just y), nothing to launch */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* use the original matrix with a transpose/conjugate-transpose operation flag */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly generated A^T, built lazily on first need */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes a per-operation cache of cuSPARSE generic-API descriptors */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Assembly hook: if a split device matrix exists (GPU assembly), read back its
   state to decide whether the host or the device copy is authoritative. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL, h_mat;
  PetscBool                  is_seq = PETSC_TRUE;
  PetscInt                   nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    /* pull the device-side header to learn its nonzero state and seq flag */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) {
    /* device-side assembly advanced past the host copy: GPU holds the data now */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys GPU-side storage (CSR structs or triangular factors), the split
   device matrix if one was created, and the composed product functions,
   then falls through to the host MatDestroy_SeqAIJ(). */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash deviceMat before spptr is destroyed; it is freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr =
MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device header to the host to find the device pointers to free */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* remove the composed methods installed at creation/conversion time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying on the host then converting the copy back to AIJCUSPARSE in place */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switch the mat-vec operation table between the host (SeqAIJ) and device
   (SeqAIJCUSPARSE) implementations; flg == PETSC_TRUE binds to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only use them when bound to the CPU */
  a->inode.use  = flg;
  PetscFunctionReturn(0);
}

/* Zero all stored values, on the host and on the split-CSR device mirror when one exists. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ  *a   = (Mat_SeqAIJ*)A->data;
    PetscInt    n    = A->rmap->n, nnz = a->i[n];
    cudaError_t err;
    PetscScalar *vals;

    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* fetch the device pointer to the values array, then zero it in place */
    err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err  = cudaMemset(vals,0,nnz*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* Only claim the device copy is up to date when we actually zeroed it; otherwise the
     freshly zeroed host values are authoritative and must be re-copied before a GPU mult.
     (Previously PETSC_OFFLOAD_BOTH was set unconditionally, leaving stale GPU values.) */
  A->offloadmask = d_mat ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: attach the GPU-side data structures,
   set VECCUDA as the default vector type, and install the CUSPARSE operation table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* MAT_INPLACE_MATRIX converts A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2532e057df02SPaul Mullowney 2533e057df02SPaul Mullowney Options Database Keys: 2534e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2535aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2536a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2537e057df02SPaul Mullowney 2538e057df02SPaul Mullowney Level: beginner 2539e057df02SPaul Mullowney 25408468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2541e057df02SPaul Mullowney M*/ 25427f756511SDominic Meiser 254342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 254442c9c57cSBarry Smith 25450f39cd5aSBarry Smith 25463ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 254742c9c57cSBarry Smith { 254842c9c57cSBarry Smith PetscErrorCode ierr; 254942c9c57cSBarry Smith 255042c9c57cSBarry Smith PetscFunctionBegin; 25513ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 25523ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 25533ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 25543ca39a21SBarry Smith ierr = 
MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 255542c9c57cSBarry Smith PetscFunctionReturn(0); 255642c9c57cSBarry Smith } 255729b38603SBarry Smith 2558470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 25597f756511SDominic Meiser { 2560e6e9a74fSStefano Zampini PetscErrorCode ierr; 25617f756511SDominic Meiser cusparseStatus_t stat; 25627f756511SDominic Meiser cusparseHandle_t handle; 25637f756511SDominic Meiser 25647f756511SDominic Meiser PetscFunctionBegin; 25657f756511SDominic Meiser if (*cusparsestruct) { 2566e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2567e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 25687f756511SDominic Meiser delete (*cusparsestruct)->workVector; 256981902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 2570afb2bd1cSJunchao Zhang if (handle = (*cusparsestruct)->handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);} 2571afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2572afb2bd1cSJunchao Zhang cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2573afb2bd1cSJunchao Zhang #endif 2574e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 25757f756511SDominic Meiser } 25767f756511SDominic Meiser PetscFunctionReturn(0); 25777f756511SDominic Meiser } 25787f756511SDominic Meiser 25797f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 25807f756511SDominic Meiser { 25817f756511SDominic Meiser PetscFunctionBegin; 25827f756511SDominic Meiser if (*mat) { 25837f756511SDominic Meiser delete (*mat)->values; 25847f756511SDominic Meiser delete (*mat)->column_indices; 25857f756511SDominic Meiser delete (*mat)->row_offsets; 
25867f756511SDominic Meiser delete *mat; 25877f756511SDominic Meiser *mat = 0; 25887f756511SDominic Meiser } 25897f756511SDominic Meiser PetscFunctionReturn(0); 25907f756511SDominic Meiser } 25917f756511SDominic Meiser 2592470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 25937f756511SDominic Meiser { 25947f756511SDominic Meiser cusparseStatus_t stat; 25957f756511SDominic Meiser PetscErrorCode ierr; 25967f756511SDominic Meiser 25977f756511SDominic Meiser PetscFunctionBegin; 25987f756511SDominic Meiser if (*trifactor) { 259957d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2600afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 26017f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2602afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2603afb2bd1cSJunchao Zhang cudaError_t cerr; 2604afb2bd1cSJunchao Zhang if ((*trifactor)->solveBuffer) {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2605afb2bd1cSJunchao Zhang if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2606afb2bd1cSJunchao Zhang #endif 26077f756511SDominic Meiser delete *trifactor; 26087f756511SDominic Meiser *trifactor = 0; 26097f756511SDominic Meiser } 26107f756511SDominic Meiser PetscFunctionReturn(0); 26117f756511SDominic Meiser } 26127f756511SDominic Meiser 2613470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 26147f756511SDominic Meiser { 26157f756511SDominic Meiser CsrMatrix *mat; 26167f756511SDominic Meiser cusparseStatus_t stat; 26177f756511SDominic Meiser cudaError_t err; 26187f756511SDominic Meiser 26197f756511SDominic Meiser PetscFunctionBegin; 
26207f756511SDominic Meiser if (*matstruct) { 26217f756511SDominic Meiser if ((*matstruct)->mat) { 26227f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2623afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2624afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2625afb2bd1cSJunchao Zhang #else 26267f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 262757d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2628afb2bd1cSJunchao Zhang #endif 26297f756511SDominic Meiser } else { 26307f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 26317f756511SDominic Meiser CsrMatrix_Destroy(&mat); 26327f756511SDominic Meiser } 26337f756511SDominic Meiser } 263457d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 26357f756511SDominic Meiser delete (*matstruct)->cprowIndices; 2636afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 26377656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 26387656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2639afb2bd1cSJunchao Zhang 2640afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2641afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2642afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2643afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 2644afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 2645afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2646afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 
2647afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2648afb2bd1cSJunchao Zhang } 2649afb2bd1cSJunchao Zhang } 2650afb2bd1cSJunchao Zhang #endif 26517f756511SDominic Meiser delete *matstruct; 26527f756511SDominic Meiser *matstruct = 0; 26537f756511SDominic Meiser } 26547f756511SDominic Meiser PetscFunctionReturn(0); 26557f756511SDominic Meiser } 26567f756511SDominic Meiser 2657ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 26587f756511SDominic Meiser { 2659e6e9a74fSStefano Zampini PetscErrorCode ierr; 2660e6e9a74fSStefano Zampini 26617f756511SDominic Meiser PetscFunctionBegin; 26627f756511SDominic Meiser if (*trifactors) { 2663e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2664e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2665e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2666e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 26677f756511SDominic Meiser delete (*trifactors)->rpermIndices; 26687f756511SDominic Meiser delete (*trifactors)->cpermIndices; 26697f756511SDominic Meiser delete (*trifactors)->workVector; 2670ccdfe979SStefano Zampini (*trifactors)->rpermIndices = 0; 2671ccdfe979SStefano Zampini (*trifactors)->cpermIndices = 0; 2672ccdfe979SStefano Zampini (*trifactors)->workVector = 0; 2673ccdfe979SStefano Zampini } 2674ccdfe979SStefano Zampini PetscFunctionReturn(0); 2675ccdfe979SStefano Zampini } 2676ccdfe979SStefano Zampini 2677ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2678ccdfe979SStefano Zampini { 2679e6e9a74fSStefano Zampini PetscErrorCode ierr; 
2680ccdfe979SStefano Zampini cusparseHandle_t handle; 2681ccdfe979SStefano Zampini cusparseStatus_t stat; 2682ccdfe979SStefano Zampini 2683ccdfe979SStefano Zampini PetscFunctionBegin; 2684ccdfe979SStefano Zampini if (*trifactors) { 2685e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 26867f756511SDominic Meiser if (handle = (*trifactors)->handle) { 268757d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 26887f756511SDominic Meiser } 2689e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 26907f756511SDominic Meiser } 26917f756511SDominic Meiser PetscFunctionReturn(0); 26927f756511SDominic Meiser } 2693