19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16bc3f50f2SPaul Mullowney 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 749ae82921SPaul Mullowney 757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 817f756511SDominic Meiser 827e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 837e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 847e8381f9SStefano Zampini 85b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 86b06137fdSPaul Mullowney { 87b06137fdSPaul Mullowney cusparseStatus_t stat; 88b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 89b06137fdSPaul Mullowney 90b06137fdSPaul Mullowney PetscFunctionBegin; 91d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 92b06137fdSPaul Mullowney cusparsestruct->stream = stream; 9357d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 94b06137fdSPaul Mullowney PetscFunctionReturn(0); 95b06137fdSPaul Mullowney } 96b06137fdSPaul Mullowney 97b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 98b06137fdSPaul Mullowney { 99b06137fdSPaul Mullowney cusparseStatus_t stat; 100b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 101b06137fdSPaul Mullowney 102b06137fdSPaul Mullowney PetscFunctionBegin; 103d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1046b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 10516a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 10657d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 10716a2e217SAlejandro Lamas Daviña } 108b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1096b1cf21dSAlejandro Lamas Daviña } 11057d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 111b06137fdSPaul Mullowney PetscFunctionReturn(0); 112b06137fdSPaul Mullowney } 113b06137fdSPaul Mullowney 114b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 115b06137fdSPaul Mullowney { 116b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1177e8381f9SStefano Zampini PetscBool flg; 1187e8381f9SStefano Zampini PetscErrorCode ierr; 119ccdfe979SStefano Zampini 120b06137fdSPaul Mullowney PetscFunctionBegin; 1217e8381f9SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1227e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 123ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 124b06137fdSPaul Mullowney PetscFunctionReturn(0); 125b06137fdSPaul Mullowney } 126b06137fdSPaul Mullowney 127ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1289ae82921SPaul Mullowney { 1299ae82921SPaul Mullowney PetscFunctionBegin; 1309ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1319ae82921SPaul Mullowney PetscFunctionReturn(0); 1329ae82921SPaul Mullowney } 1339ae82921SPaul Mullowney 134c708e6cdSJed Brown /*MC 135087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 136087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 137087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 138087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 139087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 140087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 141c708e6cdSJed Brown 1429ae82921SPaul Mullowney Level: beginner 143c708e6cdSJed Brown 1443ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 145c708e6cdSJed Brown M*/ 1469ae82921SPaul Mullowney 14742c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1489ae82921SPaul Mullowney { 1499ae82921SPaul Mullowney PetscErrorCode ierr; 150bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1519ae82921SPaul Mullowney 1529ae82921SPaul Mullowney PetscFunctionBegin; 153bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 154bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1552c7c0729SBarry Smith (*B)->factortype = ftype; 1562c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1579ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1582205254eSKarl Rupp 159087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16033d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1619ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1629ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 163087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 164087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 165087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1669ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 167bc3f50f2SPaul Mullowney 168fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1693ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1709ae82921SPaul Mullowney PetscFunctionReturn(0); 1719ae82921SPaul Mullowney } 1729ae82921SPaul Mullowney 173bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 174ca45077fSPaul Mullowney { 175aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1766e111a19SKarl Rupp 177ca45077fSPaul Mullowney PetscFunctionBegin; 178ca45077fSPaul Mullowney switch (op) { 179e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 180aa372e3fSPaul Mullowney cusparsestruct->format = format; 181ca45077fSPaul Mullowney break; 182e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 183aa372e3fSPaul Mullowney cusparsestruct->format = format; 184ca45077fSPaul Mullowney break; 185ca45077fSPaul Mullowney default: 18636d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 187ca45077fSPaul Mullowney } 188ca45077fSPaul Mullowney PetscFunctionReturn(0); 189ca45077fSPaul Mullowney } 1909ae82921SPaul Mullowney 191e057df02SPaul Mullowney /*@ 192e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 193e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 194aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 195e057df02SPaul Mullowney Not Collective 196e057df02SPaul Mullowney 197e057df02SPaul Mullowney Input Parameters: 1988468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 19936d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2002692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 201e057df02SPaul Mullowney 202e057df02SPaul Mullowney Output Parameter: 203e057df02SPaul Mullowney 204e057df02SPaul Mullowney Level: intermediate 205e057df02SPaul Mullowney 2068468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 207e057df02SPaul Mullowney @*/ 208e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 209e057df02SPaul Mullowney { 210e057df02SPaul Mullowney PetscErrorCode ierr; 2116e111a19SKarl Rupp 212e057df02SPaul Mullowney PetscFunctionBegin; 213e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 214e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 215e057df02SPaul Mullowney PetscFunctionReturn(0); 216e057df02SPaul Mullowney } 217e057df02SPaul Mullowney 218e6e9a74fSStefano Zampini /*@ 219e6e9a74fSStefano Zampini MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 220e6e9a74fSStefano Zampini 221e6e9a74fSStefano Zampini Collective on mat 222e6e9a74fSStefano Zampini 223e6e9a74fSStefano Zampini Input Parameters: 224e6e9a74fSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE 225e6e9a74fSStefano Zampini - transgen - the boolean flag 226e6e9a74fSStefano Zampini 227e6e9a74fSStefano Zampini Level: intermediate 228e6e9a74fSStefano Zampini 229e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE 230e6e9a74fSStefano Zampini @*/ 231e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 232e6e9a74fSStefano Zampini { 233e6e9a74fSStefano Zampini PetscErrorCode ierr; 234e6e9a74fSStefano Zampini PetscBool flg; 235e6e9a74fSStefano Zampini 236e6e9a74fSStefano Zampini PetscFunctionBegin; 237e6e9a74fSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 238e6e9a74fSStefano Zampini ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239e6e9a74fSStefano Zampini if (flg) { 240e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 24154da937aSStefano Zampini 242e6e9a74fSStefano Zampini if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 243e6e9a74fSStefano Zampini cusp->transgen = transgen; 24454da937aSStefano Zampini if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 24554da937aSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 24654da937aSStefano Zampini } 247e6e9a74fSStefano Zampini } 248e6e9a74fSStefano Zampini PetscFunctionReturn(0); 249e6e9a74fSStefano Zampini } 250e6e9a74fSStefano Zampini 2514416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2529ae82921SPaul Mullowney { 2539ae82921SPaul Mullowney PetscErrorCode ierr; 254e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2559ae82921SPaul Mullowney PetscBool flg; 256a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2576e111a19SKarl Rupp 2589ae82921SPaul Mullowney PetscFunctionBegin; 259e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2609ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 26154da937aSStefano Zampini PetscBool transgen = cusparsestruct->transgen; 26254da937aSStefano Zampini 26354da937aSStefano Zampini ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 264afb2bd1cSJunchao Zhang if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 265afb2bd1cSJunchao Zhang 266e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 267a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 268afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 269afb2bd1cSJunchao Zhang 2704c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 271a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 272afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 273afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 274afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 275afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 276afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 277afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 278afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 279afb2bd1cSJunchao Zhang 280afb2bd1cSJunchao Zhang cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 281afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 282afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 283afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 284afb2bd1cSJunchao Zhang 285afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 286afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 287afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 288afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 289afb2bd1cSJunchao Zhang #endif 2904c87dfd4SPaul Mullowney } 2910af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 2929ae82921SPaul Mullowney PetscFunctionReturn(0); 2939ae82921SPaul Mullowney } 2949ae82921SPaul Mullowney 2956fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2969ae82921SPaul Mullowney { 2979ae82921SPaul Mullowney PetscErrorCode ierr; 2989ae82921SPaul Mullowney 2999ae82921SPaul Mullowney PetscFunctionBegin; 3009ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3019ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3029ae82921SPaul Mullowney PetscFunctionReturn(0); 3039ae82921SPaul Mullowney } 3049ae82921SPaul Mullowney 3056fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3069ae82921SPaul Mullowney { 3079ae82921SPaul Mullowney PetscErrorCode ierr; 3089ae82921SPaul Mullowney 3099ae82921SPaul Mullowney PetscFunctionBegin; 3109ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3119ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3129ae82921SPaul Mullowney PetscFunctionReturn(0); 3139ae82921SPaul Mullowney } 3149ae82921SPaul Mullowney 315087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 316087f3262SPaul Mullowney { 317087f3262SPaul Mullowney PetscErrorCode ierr; 318087f3262SPaul Mullowney 319087f3262SPaul Mullowney PetscFunctionBegin; 320087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 321087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 322087f3262SPaul Mullowney PetscFunctionReturn(0); 323087f3262SPaul Mullowney } 324087f3262SPaul Mullowney 325087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 326087f3262SPaul Mullowney { 327087f3262SPaul Mullowney PetscErrorCode ierr; 328087f3262SPaul Mullowney 329087f3262SPaul Mullowney PetscFunctionBegin; 330087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 331087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 332087f3262SPaul Mullowney PetscFunctionReturn(0); 333087f3262SPaul Mullowney } 334087f3262SPaul Mullowney 335087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3369ae82921SPaul Mullowney { 3379ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3389ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3399ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 340aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3419ae82921SPaul Mullowney cusparseStatus_t stat; 3429ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3439ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3449ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3459ae82921SPaul Mullowney PetscScalar *AALo; 3469ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 347b175d8bbSPaul Mullowney PetscErrorCode ierr; 34857d48284SJunchao Zhang cudaError_t cerr; 3499ae82921SPaul Mullowney 3509ae82921SPaul Mullowney PetscFunctionBegin; 351cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 352c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3539ae82921SPaul Mullowney try { 3549ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3559ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 3569ae82921SPaul Mullowney 3579ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 35857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 35957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 36057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3619ae82921SPaul Mullowney 3629ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3639ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3649ae82921SPaul Mullowney AiLo[n] = nzLower; 3659ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3669ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3679ae82921SPaul Mullowney v = aa; 3689ae82921SPaul Mullowney vi = aj; 3699ae82921SPaul Mullowney offset = 1; 3709ae82921SPaul Mullowney rowOffset= 1; 3719ae82921SPaul Mullowney for (i=1; i<n; i++) { 3729ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 373e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3749ae82921SPaul Mullowney AiLo[i] = rowOffset; 3759ae82921SPaul Mullowney rowOffset += nz+1; 3769ae82921SPaul Mullowney 377580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 378580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 3799ae82921SPaul Mullowney 3809ae82921SPaul Mullowney offset += nz; 3819ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3829ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3839ae82921SPaul Mullowney offset += 1; 3849ae82921SPaul Mullowney 3859ae82921SPaul Mullowney v += nz; 3869ae82921SPaul Mullowney vi += nz; 3879ae82921SPaul Mullowney } 3882205254eSKarl Rupp 389aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 390aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 3912205254eSKarl Rupp 392aa372e3fSPaul Mullowney /* Create the matrix description */ 39357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 39457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 395*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 396afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 397afb2bd1cSJunchao Zhang #else 39857d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 399afb2bd1cSJunchao Zhang #endif 40057d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 40157d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 402aa372e3fSPaul Mullowney 403aa372e3fSPaul Mullowney /* set the operation */ 404aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 405aa372e3fSPaul Mullowney 406aa372e3fSPaul Mullowney /* set the matrix */ 407aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 408aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 409aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 410aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 411aa372e3fSPaul Mullowney 412aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 413aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 414aa372e3fSPaul Mullowney 415aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 416aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 417aa372e3fSPaul Mullowney 418aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 419aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 420aa372e3fSPaul Mullowney 421afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 422afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 423*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 424afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 425afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 426afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 427afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 428afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 429afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 430afb2bd1cSJunchao Zhang #endif 431afb2bd1cSJunchao Zhang 432aa372e3fSPaul Mullowney /* perform the solve analysis */ 433aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 434aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 435aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 436afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 437*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 438afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 439afb2bd1cSJunchao Zhang #endif 440afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 441aa372e3fSPaul Mullowney 442aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 443aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4442205254eSKarl Rupp 44557d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 44657d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 44757d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 4484863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 4499ae82921SPaul Mullowney } catch(char *ex) { 4509ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4519ae82921SPaul Mullowney } 4529ae82921SPaul Mullowney } 4539ae82921SPaul Mullowney PetscFunctionReturn(0); 4549ae82921SPaul Mullowney } 4559ae82921SPaul Mullowney 456087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4579ae82921SPaul Mullowney { 4589ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4599ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4609ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 461aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4629ae82921SPaul Mullowney cusparseStatus_t stat; 4639ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 4649ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4659ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4669ae82921SPaul Mullowney PetscScalar *AAUp; 4679ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 4689ae82921SPaul Mullowney PetscErrorCode ierr; 46957d48284SJunchao Zhang cudaError_t cerr; 4709ae82921SPaul Mullowney 4719ae82921SPaul Mullowney PetscFunctionBegin; 472cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 473c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4749ae82921SPaul Mullowney try { 4759ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 4769ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 4779ae82921SPaul Mullowney 4789ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 47957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 48057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 48157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 4829ae82921SPaul Mullowney 4839ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4849ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 4859ae82921SPaul Mullowney AiUp[n]=nzUpper; 4869ae82921SPaul Mullowney offset = nzUpper; 4879ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 4889ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 4899ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 4909ae82921SPaul Mullowney 491e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4929ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 4939ae82921SPaul Mullowney 494e057df02SPaul Mullowney /* decrement the offset */ 4959ae82921SPaul Mullowney offset -= (nz+1); 4969ae82921SPaul Mullowney 497e057df02SPaul Mullowney /* first, set the diagonal elements */ 4989ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 49909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5009ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5019ae82921SPaul Mullowney 502580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 503580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5049ae82921SPaul Mullowney } 5052205254eSKarl Rupp 506aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 507aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 5082205254eSKarl Rupp 509aa372e3fSPaul Mullowney /* Create the matrix description */ 51057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 51157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 512*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 513afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 514afb2bd1cSJunchao Zhang #else 51557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 516afb2bd1cSJunchao Zhang #endif 51757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 51857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 519aa372e3fSPaul Mullowney 520aa372e3fSPaul Mullowney /* set the operation */ 521aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 522aa372e3fSPaul Mullowney 523aa372e3fSPaul Mullowney /* set the matrix */ 524aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 525aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 526aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 527aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 528aa372e3fSPaul Mullowney 529aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 530aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 531aa372e3fSPaul Mullowney 532aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 533aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 534aa372e3fSPaul Mullowney 535aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 536aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 537aa372e3fSPaul Mullowney 538afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 539afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 540*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 541afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 542afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 543afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 544afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 545afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 546afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 547afb2bd1cSJunchao Zhang #endif 548afb2bd1cSJunchao Zhang 549aa372e3fSPaul Mullowney /* perform the solve analysis */ 550aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 551aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 552aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 553afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 554*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 555afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 556afb2bd1cSJunchao Zhang #endif 557afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 558aa372e3fSPaul Mullowney 559aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 560aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 5612205254eSKarl Rupp 56257d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 56357d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 56457d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 5654863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 5669ae82921SPaul Mullowney } catch(char *ex) { 5679ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5689ae82921SPaul Mullowney } 5699ae82921SPaul Mullowney } 5709ae82921SPaul Mullowney PetscFunctionReturn(0); 5719ae82921SPaul Mullowney } 5729ae82921SPaul Mullowney 573087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 5749ae82921SPaul Mullowney { 5759ae82921SPaul Mullowney PetscErrorCode ierr; 5769ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5779ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 5789ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 5799ae82921SPaul Mullowney PetscBool row_identity,col_identity; 5809ae82921SPaul Mullowney const PetscInt *r,*c; 5819ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5829ae82921SPaul Mullowney 5839ae82921SPaul Mullowney PetscFunctionBegin; 584ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 585087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 586087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 5872205254eSKarl Rupp 588e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 589aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 5909ae82921SPaul Mullowney 591c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 592e057df02SPaul Mullowney /* lower triangular indices */ 5939ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5949ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5952205254eSKarl Rupp if (!row_identity) { 596aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 597aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 5982205254eSKarl Rupp } 5999ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6009ae82921SPaul Mullowney 601e057df02SPaul Mullowney /* upper triangular indices */ 6029ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 6039ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6042205254eSKarl Rupp if (!col_identity) { 605aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 606aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6072205254eSKarl Rupp } 6084863603aSSatish Balay 6094863603aSSatish Balay if (!row_identity && !col_identity) { 6104863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 6114863603aSSatish Balay } else if(!row_identity) { 6124863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 6134863603aSSatish Balay } else if(!col_identity) { 6144863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 6154863603aSSatish Balay } 6164863603aSSatish Balay 6179ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 6189ae82921SPaul Mullowney PetscFunctionReturn(0); 6199ae82921SPaul Mullowney } 6209ae82921SPaul Mullowney 621087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 622087f3262SPaul Mullowney { 623087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 624087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 625aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 626aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 627087f3262SPaul Mullowney cusparseStatus_t stat; 628087f3262SPaul Mullowney PetscErrorCode ierr; 62957d48284SJunchao Zhang cudaError_t cerr; 630087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 631087f3262SPaul Mullowney PetscScalar *AAUp; 632087f3262SPaul Mullowney PetscScalar *AALo; 633087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 634087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 635087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 636087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 637087f3262SPaul Mullowney 638087f3262SPaul Mullowney PetscFunctionBegin; 639cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 640c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 641087f3262SPaul Mullowney try { 642087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 64357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 64457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 64557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 64657d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 647087f3262SPaul Mullowney 648087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 649087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 650087f3262SPaul Mullowney AiUp[n]=nzUpper; 651087f3262SPaul Mullowney offset = 0; 652087f3262SPaul Mullowney for (i=0; i<n; i++) { 653087f3262SPaul Mullowney /* set the pointers */ 654087f3262SPaul Mullowney v = aa + ai[i]; 655087f3262SPaul Mullowney vj = aj + ai[i]; 656087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 657087f3262SPaul Mullowney 658087f3262SPaul Mullowney /* first, set the diagonal elements */ 659087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 66009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 661087f3262SPaul Mullowney AiUp[i] = offset; 66209f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 663087f3262SPaul Mullowney 664087f3262SPaul Mullowney offset+=1; 665087f3262SPaul Mullowney if (nz>0) { 666f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 667580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 668087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 669087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 670087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 671087f3262SPaul Mullowney } 672087f3262SPaul Mullowney offset+=nz; 673087f3262SPaul Mullowney } 674087f3262SPaul Mullowney } 675087f3262SPaul Mullowney 676aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 677aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 678087f3262SPaul Mullowney 679aa372e3fSPaul Mullowney /* Create the matrix description */ 68057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 68157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 682*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 683afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 684afb2bd1cSJunchao Zhang #else 68557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 686afb2bd1cSJunchao Zhang #endif 68757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 68857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 689087f3262SPaul Mullowney 690aa372e3fSPaul Mullowney /* set the matrix */ 691aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 692aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 693aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 694aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 695aa372e3fSPaul Mullowney 696aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 697aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 698aa372e3fSPaul Mullowney 699aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 700aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 701aa372e3fSPaul Mullowney 702aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 703aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 704aa372e3fSPaul Mullowney 705afb2bd1cSJunchao Zhang /* set the operation */ 706afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 707afb2bd1cSJunchao Zhang 708afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 709afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 710*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 711afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 712afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 713afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 714afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 715afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 716afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 717afb2bd1cSJunchao Zhang #endif 718afb2bd1cSJunchao Zhang 719aa372e3fSPaul Mullowney /* perform the solve analysis */ 720aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 721aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 722aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 723afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 724*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 725afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 726afb2bd1cSJunchao Zhang #endif 727afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 728aa372e3fSPaul Mullowney 729aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 730aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 731aa372e3fSPaul Mullowney 732aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 733aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 734aa372e3fSPaul Mullowney 735aa372e3fSPaul Mullowney /* Create the matrix description */ 73657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 73757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 738*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 739afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 740afb2bd1cSJunchao Zhang #else 74157d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 742afb2bd1cSJunchao Zhang #endif 74357d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 74457d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 745aa372e3fSPaul Mullowney 746aa372e3fSPaul Mullowney /* set the operation */ 747aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 748aa372e3fSPaul Mullowney 749aa372e3fSPaul Mullowney /* set the matrix */ 750aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 751aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 752aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 753aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 754aa372e3fSPaul Mullowney 755aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 756aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 757aa372e3fSPaul Mullowney 758aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 759aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 760aa372e3fSPaul Mullowney 761aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 762aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 7634863603aSSatish Balay ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 764aa372e3fSPaul Mullowney 765afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 766afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 767*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 768afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 769afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 770afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 771afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 772afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 773afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 774afb2bd1cSJunchao Zhang #endif 775afb2bd1cSJunchao Zhang 776aa372e3fSPaul Mullowney /* perform the solve analysis */ 777aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 778aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 779aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 780afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 781*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 782afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 783afb2bd1cSJunchao Zhang #endif 784afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 785aa372e3fSPaul Mullowney 786aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 787aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 788087f3262SPaul Mullowney 789c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 79057d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 79157d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 79257d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 79357d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 794087f3262SPaul Mullowney } catch(char *ex) { 795087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 796087f3262SPaul Mullowney } 797087f3262SPaul Mullowney } 798087f3262SPaul Mullowney PetscFunctionReturn(0); 799087f3262SPaul Mullowney } 800087f3262SPaul Mullowney 801087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8029ae82921SPaul Mullowney { 8039ae82921SPaul Mullowney PetscErrorCode ierr; 804087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 805087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 806087f3262SPaul Mullowney IS ip = a->row; 807087f3262SPaul Mullowney const PetscInt *rip; 808087f3262SPaul Mullowney PetscBool perm_identity; 809087f3262SPaul Mullowney PetscInt n = A->rmap->n; 810087f3262SPaul Mullowney 811087f3262SPaul Mullowney PetscFunctionBegin; 812ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 813087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 814e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 815aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 816aa372e3fSPaul Mullowney 817087f3262SPaul Mullowney /* lower triangular indices */ 818087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 819087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 820087f3262SPaul Mullowney if (!perm_identity) { 8214e4bbfaaSStefano Zampini IS iip; 8224e4bbfaaSStefano Zampini const PetscInt *irip; 8234e4bbfaaSStefano Zampini 8244e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 8254e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 826aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 827aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 828aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8294e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 8304e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 8314e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 8324863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 833087f3262SPaul Mullowney } 834087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 835087f3262SPaul Mullowney PetscFunctionReturn(0); 836087f3262SPaul Mullowney } 837087f3262SPaul Mullowney 8386fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 8399ae82921SPaul Mullowney { 8409ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 8419ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 8429ae82921SPaul Mullowney PetscBool row_identity,col_identity; 843b175d8bbSPaul Mullowney PetscErrorCode ierr; 8449ae82921SPaul Mullowney 8459ae82921SPaul Mullowney PetscFunctionBegin; 8469ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 847ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 848e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 8499ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 8509ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 851bda325fcSPaul Mullowney if (row_identity && col_identity) { 852bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 853bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8544e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8554e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 856bda325fcSPaul Mullowney } else { 857bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 858bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8594e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8604e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 861bda325fcSPaul Mullowney } 8628dc1d2a3SPaul Mullowney 863e057df02SPaul Mullowney /* get the triangular factors */ 864087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 8659ae82921SPaul Mullowney PetscFunctionReturn(0); 8669ae82921SPaul Mullowney } 8679ae82921SPaul Mullowney 868087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 869087f3262SPaul Mullowney { 870087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 871087f3262SPaul Mullowney IS ip = b->row; 872087f3262SPaul Mullowney PetscBool perm_identity; 873b175d8bbSPaul Mullowney PetscErrorCode ierr; 874087f3262SPaul Mullowney 875087f3262SPaul Mullowney PetscFunctionBegin; 876087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 877ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 878087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 879087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 880087f3262SPaul Mullowney if (perm_identity) { 881087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 882087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8834e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8844e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 885087f3262SPaul Mullowney } else { 886087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 887087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8884e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8894e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 890087f3262SPaul Mullowney } 891087f3262SPaul Mullowney 892087f3262SPaul Mullowney /* get the triangular factors */ 893087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 894087f3262SPaul Mullowney PetscFunctionReturn(0); 895087f3262SPaul Mullowney } 8969ae82921SPaul Mullowney 897b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 898bda325fcSPaul Mullowney { 899bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 900aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 901aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 902aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 903aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 904bda325fcSPaul Mullowney cusparseStatus_t stat; 905aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 906aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 907aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 908aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 909*1b0a6780SStefano Zampini cudaError_t cerr; 910b175d8bbSPaul Mullowney 911bda325fcSPaul Mullowney PetscFunctionBegin; 912bda325fcSPaul Mullowney 913aa372e3fSPaul Mullowney /*********************************************/ 914aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 915aa372e3fSPaul Mullowney /*********************************************/ 916aa372e3fSPaul Mullowney 917aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 918aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 919aa372e3fSPaul Mullowney 920aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 921aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 922aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 923aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 924aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 925aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 926aa372e3fSPaul Mullowney 927aa372e3fSPaul Mullowney /* Create the matrix description */ 92857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 92957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 93057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 93157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 93257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 933aa372e3fSPaul Mullowney 934aa372e3fSPaul Mullowney /* set the operation */ 935aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 936aa372e3fSPaul Mullowney 937aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 938aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 939afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 940afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 941aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 942afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 943afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 944afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 945aa372e3fSPaul Mullowney 946aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 947afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 948afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 949afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 950afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 951afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 952afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 953afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 954afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 955afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 956afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 957*1b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 958afb2bd1cSJunchao Zhang #endif 959afb2bd1cSJunchao Zhang 960aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 961aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 962aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 963aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 964aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 965aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 966afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 967afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 968afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 969afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 970afb2bd1cSJunchao Zhang #else 971afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 972afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 973afb2bd1cSJunchao Zhang #endif 974afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 975aa372e3fSPaul Mullowney 976afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 977afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 978*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 979afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 980afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 981afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 982afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 983afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 984afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 985afb2bd1cSJunchao Zhang #endif 986afb2bd1cSJunchao Zhang 987afb2bd1cSJunchao Zhang /* perform the solve analysis */ 988aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 989afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 990afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 991afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 992*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 993afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 994afb2bd1cSJunchao Zhang #endif 995afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 996aa372e3fSPaul Mullowney 997aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 998aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 999aa372e3fSPaul Mullowney 1000aa372e3fSPaul Mullowney /*********************************************/ 1001aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1002aa372e3fSPaul Mullowney /*********************************************/ 1003aa372e3fSPaul Mullowney 1004aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1005aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 1006aa372e3fSPaul Mullowney 1007aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1008aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1009aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1010aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1011aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1012aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1013aa372e3fSPaul Mullowney 1014aa372e3fSPaul Mullowney /* Create the matrix description */ 101557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 101657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 101757d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 101857d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 101957d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1020aa372e3fSPaul Mullowney 1021aa372e3fSPaul Mullowney /* set the operation */ 1022aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1023aa372e3fSPaul Mullowney 1024aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1025aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1026afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1027afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1028aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1029afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1030afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1031afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1032aa372e3fSPaul Mullowney 1033aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1034afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1035afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1036afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1037afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1038afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1039afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1040afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1041afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1042afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1043afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1044afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1045afb2bd1cSJunchao Zhang #endif 1046afb2bd1cSJunchao Zhang 1047aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1048aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1049aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1050aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1051aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1052aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1053afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1054afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1055afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1056afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1057afb2bd1cSJunchao Zhang #else 1058afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1059afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1060afb2bd1cSJunchao Zhang #endif 1061afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1062aa372e3fSPaul Mullowney 1063afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1064afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1065*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1066afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1067afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1068afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1069afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1070afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1071afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1072afb2bd1cSJunchao Zhang #endif 1073afb2bd1cSJunchao Zhang 1074afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1075aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1076afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1077afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1078afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1079*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1081afb2bd1cSJunchao Zhang #endif 1082afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1083aa372e3fSPaul Mullowney 1084aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 1085aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1086bda325fcSPaul Mullowney PetscFunctionReturn(0); 1087bda325fcSPaul Mullowney } 1088bda325fcSPaul Mullowney 1089b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1090bda325fcSPaul Mullowney { 1091aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1092aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1093aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1094bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1095bda325fcSPaul Mullowney cusparseStatus_t stat; 1096aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1097b06137fdSPaul Mullowney cudaError_t err; 109885ba7357SStefano Zampini PetscErrorCode ierr; 1099b175d8bbSPaul Mullowney 1100bda325fcSPaul Mullowney PetscFunctionBegin; 110185ba7357SStefano Zampini if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 110285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 110385ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 110485ba7357SStefano Zampini /* create cusparse matrix */ 1105aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 110657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1107aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 110857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 110957d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1110aa372e3fSPaul Mullowney 1111b06137fdSPaul Mullowney /* set alpha and beta */ 1112afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 11137656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 11147656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1115afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 11167656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 11177656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 111857d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1119b06137fdSPaul Mullowney 1120aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1121aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1122aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 1123554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1124554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1125aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1126a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1127aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1128aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1129a3fdcf43SKarl Rupp 113081902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 113181902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1132afb2bd1cSJunchao Zhang 113381902715SJunchao Zhang /* compute the transpose, i.e. the CSC */ 1134afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1135afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1136afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1137afb2bd1cSJunchao Zhang matrix->values->data().get(), 1138afb2bd1cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1139afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1140afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1141afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1142afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1143afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1144afb2bd1cSJunchao Zhang err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1145afb2bd1cSJunchao Zhang #endif 1146afb2bd1cSJunchao Zhang 1147a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1148a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 1149aa372e3fSPaul Mullowney matrix->values->data().get(), 115081902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1151aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1152aa372e3fSPaul Mullowney matrixT->values->data().get(), 1153afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1154afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1155afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1156afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1157afb2bd1cSJunchao Zhang #else 1158afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1159afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1160afb2bd1cSJunchao Zhang #endif 1161afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1162aa372e3fSPaul Mullowney matstructT->mat = matrixT; 1163afb2bd1cSJunchao Zhang 1164afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1165afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1166afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1167afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1168afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1169afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1170afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1171afb2bd1cSJunchao Zhang #endif 1172aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1173afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1174afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1175afb2bd1cSJunchao Zhang #else 1176aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 117751c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 117851c6d536SStefano Zampini /* First convert HYB to CSR */ 1179aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1180aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1181aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1182aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1183aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1184aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1185aa372e3fSPaul Mullowney 1186aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1187aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1188aa372e3fSPaul Mullowney temp->values->data().get(), 1189aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 119057d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1191aa372e3fSPaul Mullowney 1192aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1193aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1194aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1195aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1196aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1197aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1198aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1199aa372e3fSPaul Mullowney 1200aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1201aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1202aa372e3fSPaul Mullowney temp->values->data().get(), 1203aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1204aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1205aa372e3fSPaul Mullowney tempT->values->data().get(), 1206aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1207aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 120857d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1209aa372e3fSPaul Mullowney 1210aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1211aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 121257d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1213aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1214aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1215aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1216aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1217aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1218aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 121957d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1220aa372e3fSPaul Mullowney 1221aa372e3fSPaul Mullowney /* assign the pointer */ 1222aa372e3fSPaul Mullowney matstructT->mat = hybMat; 1223aa372e3fSPaul Mullowney /* delete temporaries */ 1224aa372e3fSPaul Mullowney if (tempT) { 1225aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1226aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1227aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1228aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1229087f3262SPaul Mullowney } 1230aa372e3fSPaul Mullowney if (temp) { 1231aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1232aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1233aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1234aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1235aa372e3fSPaul Mullowney } 1236afb2bd1cSJunchao Zhang #endif 1237aa372e3fSPaul Mullowney } 123805035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 123985ba7357SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 124085ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1241213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1242213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1243aa372e3fSPaul Mullowney /* assign the pointer */ 1244aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1245bda325fcSPaul Mullowney PetscFunctionReturn(0); 1246bda325fcSPaul Mullowney } 1247bda325fcSPaul Mullowney 12484e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 12496fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1250bda325fcSPaul Mullowney { 1251c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1252465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1253465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1254465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1255465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1256bda325fcSPaul Mullowney cusparseStatus_t stat; 1257bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1258aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1259aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1260aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1261b175d8bbSPaul Mullowney PetscErrorCode ierr; 126257d48284SJunchao Zhang cudaError_t cerr; 1263bda325fcSPaul Mullowney 1264bda325fcSPaul Mullowney PetscFunctionBegin; 1265aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1266aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1267bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1268aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1269aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1270bda325fcSPaul Mullowney } 1271bda325fcSPaul Mullowney 1272bda325fcSPaul Mullowney /* Get the GPU pointers */ 1273c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1274c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1275c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1276c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1277bda325fcSPaul Mullowney 12787a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1279aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1280c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1281c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1282c41cb2e2SAlejandro Lamas Daviña xGPU); 1283aa372e3fSPaul Mullowney 1284aa372e3fSPaul Mullowney /* First, solve U */ 1285aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1286afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 1287*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1288afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1289afb2bd1cSJunchao Zhang #endif 1290afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1291aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1292aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1293aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1294aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1295afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 1296*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1297afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1298afb2bd1cSJunchao Zhang #endif 1299afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1300aa372e3fSPaul Mullowney 1301aa372e3fSPaul Mullowney /* Then, solve L */ 1302aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1303afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 1304*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1305afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1306afb2bd1cSJunchao Zhang #endif 1307afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1308aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1309aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1310aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1311aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1312afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1313*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1314afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1315afb2bd1cSJunchao Zhang #endif 1316afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1317aa372e3fSPaul Mullowney 1318aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1319c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1320c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1321aa372e3fSPaul Mullowney tempGPU->begin()); 1322aa372e3fSPaul Mullowney 1323aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1324c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1325bda325fcSPaul Mullowney 1326bda325fcSPaul Mullowney /* restore */ 1327c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1328c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 132905035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1330661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1331958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1332bda325fcSPaul Mullowney PetscFunctionReturn(0); 1333bda325fcSPaul Mullowney } 1334bda325fcSPaul Mullowney 13356fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1336bda325fcSPaul Mullowney { 1337465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1338465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1339bda325fcSPaul Mullowney cusparseStatus_t stat; 1340bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1341aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1342aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1343aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1344b175d8bbSPaul Mullowney PetscErrorCode ierr; 134557d48284SJunchao Zhang cudaError_t cerr; 1346bda325fcSPaul Mullowney 1347bda325fcSPaul Mullowney PetscFunctionBegin; 1348aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1349aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1350bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1351aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1352aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1353bda325fcSPaul Mullowney } 1354bda325fcSPaul Mullowney 1355bda325fcSPaul Mullowney /* Get the GPU pointers */ 1356c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1357c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1358bda325fcSPaul Mullowney 13597a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1360aa372e3fSPaul Mullowney /* First, solve U */ 1361aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1362afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 1363*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1364afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1365afb2bd1cSJunchao Zhang #endif 1366afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1367aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1368aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1369aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1370aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1371afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 1372*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1373afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1374afb2bd1cSJunchao Zhang #endif 1375afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1376aa372e3fSPaul Mullowney 1377aa372e3fSPaul Mullowney /* Then, solve L */ 1378aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1379afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 1380*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1381afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1382afb2bd1cSJunchao Zhang #endif 1383afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1384aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1385aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1386aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1387aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1388afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1389*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1390afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1391afb2bd1cSJunchao Zhang #endif 1392afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1393bda325fcSPaul Mullowney 1394bda325fcSPaul Mullowney /* restore */ 1395c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1396c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 139705035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1398661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1399958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1400bda325fcSPaul Mullowney PetscFunctionReturn(0); 1401bda325fcSPaul Mullowney } 1402bda325fcSPaul Mullowney 14036fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 14049ae82921SPaul Mullowney { 1405465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1406465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1407465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1408465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 14099ae82921SPaul Mullowney cusparseStatus_t stat; 14109ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1411aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1412aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1413aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1414b175d8bbSPaul Mullowney PetscErrorCode ierr; 141557d48284SJunchao Zhang cudaError_t cerr; 14169ae82921SPaul Mullowney 14179ae82921SPaul Mullowney PetscFunctionBegin; 1418ebc8f436SDominic Meiser 1419e057df02SPaul Mullowney /* Get the GPU pointers */ 1420c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1421c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1422c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1423c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 14249ae82921SPaul Mullowney 14257a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1426aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1427c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1428c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 14294e4bbfaaSStefano Zampini tempGPU->begin()); 1430aa372e3fSPaul Mullowney 1431aa372e3fSPaul Mullowney /* Next, solve L */ 1432aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1433afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 1434*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1435afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1436afb2bd1cSJunchao Zhang #endif 1437afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1438aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1439aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1440aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1441aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1442afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1443*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1444afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1445afb2bd1cSJunchao Zhang #endif 1446afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1447aa372e3fSPaul Mullowney 1448aa372e3fSPaul Mullowney /* Then, solve U */ 1449aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1450afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 1451*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1452afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1453afb2bd1cSJunchao Zhang #endif 1454afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1455aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1456aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1457aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1458aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1459afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 1460*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1461afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1462afb2bd1cSJunchao Zhang #endif 1463afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1464aa372e3fSPaul Mullowney 14654e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 14664e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 14674e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 14684e4bbfaaSStefano Zampini xGPU); 14699ae82921SPaul Mullowney 1470c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1471c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 147205035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1473661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1474958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 14759ae82921SPaul Mullowney PetscFunctionReturn(0); 14769ae82921SPaul Mullowney } 14779ae82921SPaul Mullowney 14786fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 14799ae82921SPaul Mullowney { 1480465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1481465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 14829ae82921SPaul Mullowney cusparseStatus_t stat; 14839ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1484aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1485aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1486aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1487b175d8bbSPaul Mullowney PetscErrorCode ierr; 148857d48284SJunchao Zhang cudaError_t cerr; 14899ae82921SPaul Mullowney 14909ae82921SPaul Mullowney PetscFunctionBegin; 1491e057df02SPaul Mullowney /* Get the GPU pointers */ 1492c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1493c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 14949ae82921SPaul Mullowney 14957a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1496aa372e3fSPaul Mullowney /* First, solve L */ 1497aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1498afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 1499*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1500afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1501afb2bd1cSJunchao Zhang #endif 1502afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1503aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1504aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1505aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1506aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1507afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 1508*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1509afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1510afb2bd1cSJunchao Zhang #endif 1511afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1512aa372e3fSPaul Mullowney 1513aa372e3fSPaul Mullowney /* Next, solve U */ 1514aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1515afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 1516*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1517afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1518afb2bd1cSJunchao Zhang #endif 1519afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1520aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1521aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1522aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1523aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1524afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1525*1b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1526afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1527afb2bd1cSJunchao Zhang #endif 1528afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 15299ae82921SPaul Mullowney 1530c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1531c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 153205035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1533661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1534958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 15359ae82921SPaul Mullowney PetscFunctionReturn(0); 15369ae82921SPaul Mullowney } 15379ae82921SPaul Mullowney 15387e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 15397e8381f9SStefano Zampini { 15407e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 15417e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 15427e8381f9SStefano Zampini cudaError_t cerr; 15437e8381f9SStefano Zampini PetscErrorCode ierr; 15447e8381f9SStefano Zampini 15457e8381f9SStefano Zampini PetscFunctionBegin; 15467e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 15477e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 15487e8381f9SStefano Zampini 15497e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 15507e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 15517e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 15527e8381f9SStefano Zampini ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 15537e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 15547e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 15557e8381f9SStefano Zampini } 15567e8381f9SStefano Zampini PetscFunctionReturn(0); 15577e8381f9SStefano Zampini } 15587e8381f9SStefano Zampini 15597e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 15607e8381f9SStefano Zampini { 15617e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 15627e8381f9SStefano Zampini PetscErrorCode ierr; 15637e8381f9SStefano Zampini 15647e8381f9SStefano Zampini PetscFunctionBegin; 15657e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 15667e8381f9SStefano Zampini *array = a->a; 15677e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 15687e8381f9SStefano Zampini PetscFunctionReturn(0); 15697e8381f9SStefano Zampini } 15707e8381f9SStefano Zampini 15716fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 15729ae82921SPaul Mullowney { 1573aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 15747c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 15759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1576213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 15779ae82921SPaul Mullowney PetscErrorCode ierr; 1578aa372e3fSPaul Mullowney cusparseStatus_t stat; 1579b06137fdSPaul Mullowney cudaError_t err; 15809ae82921SPaul Mullowney 15819ae82921SPaul Mullowney PetscFunctionBegin; 158295639643SRichard Tran Mills if (A->boundtocpu) PetscFunctionReturn(0); 1583c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 158481902715SJunchao Zhang if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 158581902715SJunchao Zhang /* Copy values only */ 1586afb2bd1cSJunchao Zhang CsrMatrix *matrix,*matrixT; 1587afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 158885ba7357SStefano Zampini 158985ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1590afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 159105035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 15924863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 159385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 159481902715SJunchao Zhang 159581902715SJunchao Zhang /* Update matT when it was built before */ 159681902715SJunchao Zhang if (cusparsestruct->matTranspose) { 159781902715SJunchao Zhang cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1598afb2bd1cSJunchao Zhang matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 159985ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 160081902715SJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1601afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1602afb2bd1cSJunchao Zhang matrix->values->data().get(), 160381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1604afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1605afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1606afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1607afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1608afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1609afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1610afb2bd1cSJunchao Zhang #else 1611afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1612afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1613afb2bd1cSJunchao Zhang #endif 1614afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 161505035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 161685ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 161781902715SJunchao Zhang } 161834d6c7a5SJose E. Roman } else { 161985ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 16207c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 16217c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 16227c700b8dSJunchao Zhang delete cusparsestruct->workVector; 162381902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 16249ae82921SPaul Mullowney try { 16259ae82921SPaul Mullowney if (a->compressedrow.use) { 16269ae82921SPaul Mullowney m = a->compressedrow.nrows; 16279ae82921SPaul Mullowney ii = a->compressedrow.i; 16289ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 16299ae82921SPaul Mullowney } else { 1630213423ffSJunchao Zhang m = A->rmap->n; 1631213423ffSJunchao Zhang ii = a->i; 1632e6e9a74fSStefano Zampini ridx = NULL; 16339ae82921SPaul Mullowney } 1634213423ffSJunchao Zhang cusparsestruct->nrows = m; 16359ae82921SPaul Mullowney 163685ba7357SStefano Zampini /* create cusparse matrix */ 1637aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 163857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 163957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 164057d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 16419ae82921SPaul Mullowney 1642afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 16437656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 16447656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1645afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 16467656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 16477656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 164857d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1649b06137fdSPaul Mullowney 1650aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1651aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1652aa372e3fSPaul Mullowney /* set the matrix */ 1653afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1654afb2bd1cSJunchao Zhang mat->num_rows = m; 1655afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1656afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1657afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1658afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 16599ae82921SPaul Mullowney 1660afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1661afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1662aa372e3fSPaul Mullowney 1663afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1664afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1665aa372e3fSPaul Mullowney 1666aa372e3fSPaul Mullowney /* assign the pointer */ 1667afb2bd1cSJunchao Zhang matstruct->mat = mat; 1668afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1669afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1670afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1671afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1672afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1673afb2bd1cSJunchao Zhang mat->values->data().get(), 1674afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1675afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1676afb2bd1cSJunchao Zhang } 1677afb2bd1cSJunchao Zhang #endif 1678aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1679afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1680afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1681afb2bd1cSJunchao Zhang #else 1682afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1683afb2bd1cSJunchao Zhang mat->num_rows = m; 1684afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1685afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1686afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1687afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1688aa372e3fSPaul Mullowney 1689afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1690afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1691aa372e3fSPaul Mullowney 1692afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1693afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1694aa372e3fSPaul Mullowney 1695aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 169657d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1697aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1698aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1699afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1700afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1701afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1702afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 170357d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1704aa372e3fSPaul Mullowney /* assign the pointer */ 1705aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1706aa372e3fSPaul Mullowney 1707afb2bd1cSJunchao Zhang if (mat) { 1708afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1709afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1710afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1711afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1712087f3262SPaul Mullowney } 1713afb2bd1cSJunchao Zhang #endif 1714087f3262SPaul Mullowney } 1715ca45077fSPaul Mullowney 1716aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1717213423ffSJunchao Zhang if (a->compressedrow.use) { 1718213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1719aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1720aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1721213423ffSJunchao Zhang tmp = m; 1722213423ffSJunchao Zhang } else { 1723213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1724213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1725213423ffSJunchao Zhang tmp = 0; 1726213423ffSJunchao Zhang } 1727213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1728aa372e3fSPaul Mullowney 1729aa372e3fSPaul Mullowney /* assign the pointer */ 1730aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 17319ae82921SPaul Mullowney } catch(char *ex) { 17329ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 17339ae82921SPaul Mullowney } 173405035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 173585ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 173634d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 173734d6c7a5SJose E. Roman } 1738c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 17399ae82921SPaul Mullowney } 17409ae82921SPaul Mullowney PetscFunctionReturn(0); 17419ae82921SPaul Mullowney } 17429ae82921SPaul Mullowney 1743c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1744aa372e3fSPaul Mullowney { 1745aa372e3fSPaul Mullowney template <typename Tuple> 1746aa372e3fSPaul Mullowney __host__ __device__ 1747aa372e3fSPaul Mullowney void operator()(Tuple t) 1748aa372e3fSPaul Mullowney { 1749aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1750aa372e3fSPaul Mullowney } 1751aa372e3fSPaul Mullowney }; 1752aa372e3fSPaul Mullowney 17537e8381f9SStefano Zampini struct VecCUDAEquals 17547e8381f9SStefano Zampini { 17557e8381f9SStefano Zampini template <typename Tuple> 17567e8381f9SStefano Zampini __host__ __device__ 17577e8381f9SStefano Zampini void operator()(Tuple t) 17587e8381f9SStefano Zampini { 17597e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 17607e8381f9SStefano Zampini } 17617e8381f9SStefano Zampini }; 17627e8381f9SStefano Zampini 1763e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1764e6e9a74fSStefano Zampini { 1765e6e9a74fSStefano Zampini template <typename Tuple> 1766e6e9a74fSStefano Zampini __host__ __device__ 1767e6e9a74fSStefano Zampini void operator()(Tuple t) 1768e6e9a74fSStefano Zampini { 1769e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1770e6e9a74fSStefano Zampini } 1771e6e9a74fSStefano Zampini }; 1772e6e9a74fSStefano Zampini 1773afb2bd1cSJunchao Zhang struct MatMatCusparse { 1774ccdfe979SStefano Zampini PetscBool cisdense; 1775ccdfe979SStefano Zampini PetscScalar *Bt; 1776ccdfe979SStefano Zampini Mat X; 1777afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1778afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1779afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1780afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1781afb2bd1cSJunchao Zhang size_t spmmBufferSize; 1782afb2bd1cSJunchao Zhang void *spmmBuffer; 1783afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1784afb2bd1cSJunchao Zhang #endif 1785afb2bd1cSJunchao Zhang }; 1786ccdfe979SStefano Zampini 1787ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1788ccdfe979SStefano Zampini { 1789ccdfe979SStefano Zampini PetscErrorCode ierr; 1790ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1791ccdfe979SStefano Zampini cudaError_t cerr; 1792ccdfe979SStefano Zampini 1793ccdfe979SStefano Zampini PetscFunctionBegin; 1794ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1795afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1796afb2bd1cSJunchao Zhang cusparseStatus_t stat; 1797afb2bd1cSJunchao Zhang if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1798afb2bd1cSJunchao Zhang if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1799afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1800afb2bd1cSJunchao Zhang #endif 1801ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1802ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1803ccdfe979SStefano Zampini PetscFunctionReturn(0); 1804ccdfe979SStefano Zampini } 1805ccdfe979SStefano Zampini 1806ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1807ccdfe979SStefano Zampini 1808ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1809ccdfe979SStefano Zampini { 1810ccdfe979SStefano Zampini Mat_Product *product = C->product; 1811ccdfe979SStefano Zampini Mat A,B; 1812afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1813ccdfe979SStefano Zampini PetscBool flg,biscuda; 1814ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1815ccdfe979SStefano Zampini cusparseStatus_t stat; 1816ccdfe979SStefano Zampini cusparseOperation_t opA; 1817ccdfe979SStefano Zampini const PetscScalar *barray; 1818ccdfe979SStefano Zampini PetscScalar *carray; 1819ccdfe979SStefano Zampini PetscErrorCode ierr; 1820ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1821ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1822ccdfe979SStefano Zampini CsrMatrix *csrmat; 1823afb2bd1cSJunchao Zhang cudaError_t cerr; 1824ccdfe979SStefano Zampini 1825ccdfe979SStefano Zampini PetscFunctionBegin; 1826ccdfe979SStefano Zampini MatCheckProduct(C,1); 1827ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1828ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1829ccdfe979SStefano Zampini A = product->A; 1830ccdfe979SStefano Zampini B = product->B; 1831ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1832ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1833ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1834ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1835ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1836ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1837ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1838ccdfe979SStefano Zampini switch (product->type) { 1839ccdfe979SStefano Zampini case MATPRODUCT_AB: 1840ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1841ccdfe979SStefano Zampini mat = cusp->mat; 1842ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1843ccdfe979SStefano Zampini m = A->rmap->n; 1844ccdfe979SStefano Zampini n = B->cmap->n; 1845ccdfe979SStefano Zampini break; 1846ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1847e6e9a74fSStefano Zampini if (!cusp->transgen) { 1848e6e9a74fSStefano Zampini mat = cusp->mat; 1849e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1850e6e9a74fSStefano Zampini } else { 1851ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1852ccdfe979SStefano Zampini mat = cusp->matTranspose; 1853ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1854e6e9a74fSStefano Zampini } 1855ccdfe979SStefano Zampini m = A->cmap->n; 1856ccdfe979SStefano Zampini n = B->cmap->n; 1857ccdfe979SStefano Zampini break; 1858ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1859ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1860ccdfe979SStefano Zampini mat = cusp->mat; 1861ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1862ccdfe979SStefano Zampini m = A->rmap->n; 1863ccdfe979SStefano Zampini n = B->rmap->n; 1864ccdfe979SStefano Zampini break; 1865ccdfe979SStefano Zampini default: 1866ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1867ccdfe979SStefano Zampini } 1868ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1869ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 1870ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 1871ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1872afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1873ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1874afb2bd1cSJunchao Zhang 1875ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1876c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1877c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1878c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1879c8378d12SStefano Zampini } else { 1880c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1881c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1882c8378d12SStefano Zampini } 1883c8378d12SStefano Zampini 1884c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1885afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1886afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1887afb2bd1cSJunchao Zhang /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1888afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1889afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1890afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 1891afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1892afb2bd1cSJunchao Zhang mmdata->Blda = blda; 1893afb2bd1cSJunchao Zhang } 1894c8378d12SStefano Zampini 1895afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1896afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1897afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1898afb2bd1cSJunchao Zhang mmdata->Clda = clda; 1899afb2bd1cSJunchao Zhang } 1900afb2bd1cSJunchao Zhang 1901afb2bd1cSJunchao Zhang if (!mat->matDescr) { 1902afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 1903afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1904afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 1905afb2bd1cSJunchao Zhang csrmat->values->data().get(), 1906afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1907afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1908afb2bd1cSJunchao Zhang } 1909afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 1910afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1911afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 1912afb2bd1cSJunchao Zhang cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 1913afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1914afb2bd1cSJunchao Zhang cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 1915afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 1916afb2bd1cSJunchao Zhang } else { 1917afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 1918afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 1919afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 1920afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 1921afb2bd1cSJunchao Zhang } 1922afb2bd1cSJunchao Zhang 1923afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 1924afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 1925afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1926afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 1927afb2bd1cSJunchao Zhang cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 1928afb2bd1cSJunchao Zhang #else 1929afb2bd1cSJunchao Zhang PetscInt k; 1930afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 1931ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1932ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 1933ccdfe979SStefano Zampini cublasStatus_t cerr; 1934ccdfe979SStefano Zampini 1935ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1936ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1937ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 1938ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 1939ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 1940ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1941ccdfe979SStefano Zampini blda = B->cmap->n; 1942afb2bd1cSJunchao Zhang k = B->cmap->n; 1943afb2bd1cSJunchao Zhang } else { 1944afb2bd1cSJunchao Zhang k = B->rmap->n; 1945ccdfe979SStefano Zampini } 1946ccdfe979SStefano Zampini 1947afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 1948ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1949afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 1950ccdfe979SStefano Zampini csrmat->values->data().get(), 1951ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 1952ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 1953ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1954ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 1955afb2bd1cSJunchao Zhang #endif 1956afb2bd1cSJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1957c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1958c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1959ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1960ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 1961ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1962ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1963ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 1964ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1965ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1966ccdfe979SStefano Zampini } else { 1967ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1968ccdfe979SStefano Zampini } 1969ccdfe979SStefano Zampini if (mmdata->cisdense) { 1970ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1971ccdfe979SStefano Zampini } 1972ccdfe979SStefano Zampini if (!biscuda) { 1973ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1974ccdfe979SStefano Zampini } 1975ccdfe979SStefano Zampini PetscFunctionReturn(0); 1976ccdfe979SStefano Zampini } 1977ccdfe979SStefano Zampini 1978ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1979ccdfe979SStefano Zampini { 1980ccdfe979SStefano Zampini Mat_Product *product = C->product; 1981ccdfe979SStefano Zampini Mat A,B; 1982ccdfe979SStefano Zampini PetscInt m,n; 1983ccdfe979SStefano Zampini PetscBool cisdense,flg; 1984ccdfe979SStefano Zampini PetscErrorCode ierr; 1985ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1986ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1987ccdfe979SStefano Zampini 1988ccdfe979SStefano Zampini PetscFunctionBegin; 1989ccdfe979SStefano Zampini MatCheckProduct(C,1); 1990ccdfe979SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1991ccdfe979SStefano Zampini A = product->A; 1992ccdfe979SStefano Zampini B = product->B; 1993ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1994ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1995ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1996ccdfe979SStefano Zampini if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1997ccdfe979SStefano Zampini switch (product->type) { 1998ccdfe979SStefano Zampini case MATPRODUCT_AB: 1999ccdfe979SStefano Zampini m = A->rmap->n; 2000ccdfe979SStefano Zampini n = B->cmap->n; 2001ccdfe979SStefano Zampini break; 2002ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2003ccdfe979SStefano Zampini m = A->cmap->n; 2004ccdfe979SStefano Zampini n = B->cmap->n; 2005ccdfe979SStefano Zampini break; 2006ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2007ccdfe979SStefano Zampini m = A->rmap->n; 2008ccdfe979SStefano Zampini n = B->rmap->n; 2009ccdfe979SStefano Zampini break; 2010ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2011ccdfe979SStefano Zampini m = B->cmap->n; 2012ccdfe979SStefano Zampini n = B->cmap->n; 2013ccdfe979SStefano Zampini break; 2014ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2015ccdfe979SStefano Zampini m = B->rmap->n; 2016ccdfe979SStefano Zampini n = B->rmap->n; 2017ccdfe979SStefano Zampini break; 2018ccdfe979SStefano Zampini default: 2019ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2020ccdfe979SStefano Zampini } 2021ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2022ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2023ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2024ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2025ccdfe979SStefano Zampini 2026ccdfe979SStefano Zampini /* product data */ 2027ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2028ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2029afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2030afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2031ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2032afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2033ccdfe979SStefano Zampini } 2034afb2bd1cSJunchao Zhang #endif 2035ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2036ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2037ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2038ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2039ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2040ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2041ccdfe979SStefano Zampini } else { 2042ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2043ccdfe979SStefano Zampini } 2044ccdfe979SStefano Zampini } 2045ccdfe979SStefano Zampini C->product->data = mmdata; 2046ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2047ccdfe979SStefano Zampini 2048ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2049ccdfe979SStefano Zampini PetscFunctionReturn(0); 2050ccdfe979SStefano Zampini } 2051ccdfe979SStefano Zampini 2052ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2053ccdfe979SStefano Zampini 2054ccdfe979SStefano Zampini /* handles dense B */ 2055ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2056ccdfe979SStefano Zampini { 2057ccdfe979SStefano Zampini Mat_Product *product = C->product; 2058ccdfe979SStefano Zampini PetscErrorCode ierr; 2059ccdfe979SStefano Zampini 2060ccdfe979SStefano Zampini PetscFunctionBegin; 2061ccdfe979SStefano Zampini MatCheckProduct(C,1); 2062ccdfe979SStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2063ccdfe979SStefano Zampini if (product->A->boundtocpu) { 2064ccdfe979SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2065ccdfe979SStefano Zampini PetscFunctionReturn(0); 2066ccdfe979SStefano Zampini } 2067ccdfe979SStefano Zampini switch (product->type) { 2068ccdfe979SStefano Zampini case MATPRODUCT_AB: 2069ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2070ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2071ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2072ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2073ccdfe979SStefano Zampini C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2074ccdfe979SStefano Zampini default: 2075ccdfe979SStefano Zampini break; 2076ccdfe979SStefano Zampini } 2077ccdfe979SStefano Zampini PetscFunctionReturn(0); 2078ccdfe979SStefano Zampini } 2079ccdfe979SStefano Zampini 20806fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 20819ae82921SPaul Mullowney { 2082b175d8bbSPaul Mullowney PetscErrorCode ierr; 20839ae82921SPaul Mullowney 20849ae82921SPaul Mullowney PetscFunctionBegin; 2085e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2086e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2087e6e9a74fSStefano Zampini } 2088e6e9a74fSStefano Zampini 2089e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2090e6e9a74fSStefano Zampini { 2091e6e9a74fSStefano Zampini PetscErrorCode ierr; 2092e6e9a74fSStefano Zampini 2093e6e9a74fSStefano Zampini PetscFunctionBegin; 2094e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2095e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2096e6e9a74fSStefano Zampini } 2097e6e9a74fSStefano Zampini 2098e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2099e6e9a74fSStefano Zampini { 2100e6e9a74fSStefano Zampini PetscErrorCode ierr; 2101e6e9a74fSStefano Zampini 2102e6e9a74fSStefano Zampini PetscFunctionBegin; 2103e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2104e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2105e6e9a74fSStefano Zampini } 2106e6e9a74fSStefano Zampini 2107e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2108e6e9a74fSStefano Zampini { 2109e6e9a74fSStefano Zampini PetscErrorCode ierr; 2110e6e9a74fSStefano Zampini 2111e6e9a74fSStefano Zampini PetscFunctionBegin; 2112e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 21139ae82921SPaul Mullowney PetscFunctionReturn(0); 21149ae82921SPaul Mullowney } 21159ae82921SPaul Mullowney 21166fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2117ca45077fSPaul Mullowney { 2118b175d8bbSPaul Mullowney PetscErrorCode ierr; 2119ca45077fSPaul Mullowney 2120ca45077fSPaul Mullowney PetscFunctionBegin; 2121e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2122ca45077fSPaul Mullowney PetscFunctionReturn(0); 2123ca45077fSPaul Mullowney } 2124ca45077fSPaul Mullowney 2125afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2126e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 21279ae82921SPaul Mullowney { 21289ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2129aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 21309ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2131e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2132b175d8bbSPaul Mullowney PetscErrorCode ierr; 213357d48284SJunchao Zhang cudaError_t cerr; 2134aa372e3fSPaul Mullowney cusparseStatus_t stat; 2135e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2136e6e9a74fSStefano Zampini PetscBool compressed; 2137afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2138afb2bd1cSJunchao Zhang PetscInt nx,ny; 2139afb2bd1cSJunchao Zhang #endif 21406e111a19SKarl Rupp 21419ae82921SPaul Mullowney PetscFunctionBegin; 2142e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2143e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2144afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2145d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2146e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2147e6e9a74fSStefano Zampini } 214834d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 214934d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2150e6e9a74fSStefano Zampini if (!trans) { 21519ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2152c9567895SMark if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2153e6e9a74fSStefano Zampini } else { 2154e6e9a74fSStefano Zampini if (herm || !cusparsestruct->transgen) { 2155e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2156e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2157e6e9a74fSStefano Zampini } else { 2158afb2bd1cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2159e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2160e6e9a74fSStefano Zampini } 2161e6e9a74fSStefano Zampini } 2162e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2163e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2164213423ffSJunchao Zhang 2165e6e9a74fSStefano Zampini try { 2166e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2167213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2168213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2169afb2bd1cSJunchao Zhang 217085ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2171e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2172afb2bd1cSJunchao Zhang /* z = A x + beta y. 2173afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2174afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2175afb2bd1cSJunchao Zhang */ 2176e6e9a74fSStefano Zampini xptr = xarray; 2177afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2178213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2179afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2180afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2181afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2182afb2bd1cSJunchao Zhang */ 2183afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2184afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2185afb2bd1cSJunchao Zhang nx = mat->num_cols; 2186afb2bd1cSJunchao Zhang ny = mat->num_rows; 2187afb2bd1cSJunchao Zhang } 2188afb2bd1cSJunchao Zhang #endif 2189e6e9a74fSStefano Zampini } else { 2190afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2191afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2192afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2193afb2bd1cSJunchao Zhang */ 2194afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2195e6e9a74fSStefano Zampini dptr = zarray; 2196e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2197afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2198e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2199e6e9a74fSStefano Zampini thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2200e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2201e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2202e6e9a74fSStefano Zampini } 2203afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2204afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2205afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2206afb2bd1cSJunchao Zhang nx = mat->num_rows; 2207afb2bd1cSJunchao Zhang ny = mat->num_cols; 2208afb2bd1cSJunchao Zhang } 2209afb2bd1cSJunchao Zhang #endif 2210e6e9a74fSStefano Zampini } 22119ae82921SPaul Mullowney 2212afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2213aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2214afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2215afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2216afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2217afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2218afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2219afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2220afb2bd1cSJunchao Zhang matstruct->matDescr, 2221afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2222afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2223afb2bd1cSJunchao Zhang cusparse_scalartype, 2224afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2225afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2226afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2227afb2bd1cSJunchao Zhang 2228afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2229afb2bd1cSJunchao Zhang } else { 2230afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2231afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2232afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2233afb2bd1cSJunchao Zhang } 2234afb2bd1cSJunchao Zhang 2235afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2236afb2bd1cSJunchao Zhang matstruct->alpha_one, 2237afb2bd1cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2238afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2239afb2bd1cSJunchao Zhang beta, 2240afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2241afb2bd1cSJunchao Zhang cusparse_scalartype, 2242afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2243afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2244afb2bd1cSJunchao Zhang #else 22457656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2246e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2247a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 2248afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2249aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2250e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 225157d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2252afb2bd1cSJunchao Zhang #endif 2253aa372e3fSPaul Mullowney } else { 2254213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2255afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2256afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2257afb2bd1cSJunchao Zhang #else 2258301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2259e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2260afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2261e6e9a74fSStefano Zampini xptr, beta, 226257d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2263afb2bd1cSJunchao Zhang #endif 2264a65300a6SPaul Mullowney } 2265aa372e3fSPaul Mullowney } 226605035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2267958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2268aa372e3fSPaul Mullowney 2269e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2270213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2271213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2272213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2273e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2274213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 22757656d835SStefano Zampini } 2276213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2277c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 22787656d835SStefano Zampini } 22797656d835SStefano Zampini 2280213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2281213423ffSJunchao Zhang if (compressed) { 2282213423ffSJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2283e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2284c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2285e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2286c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 228705035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2288958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2289e6e9a74fSStefano Zampini } 2290e6e9a74fSStefano Zampini } else { 2291e6e9a74fSStefano Zampini if (yy && yy != zz) { 2292e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2293e6e9a74fSStefano Zampini } 2294e6e9a74fSStefano Zampini } 2295e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2296213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2297213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 22989ae82921SPaul Mullowney } catch(char *ex) { 22999ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 23009ae82921SPaul Mullowney } 2301e6e9a74fSStefano Zampini if (yy) { 2302958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2303e6e9a74fSStefano Zampini } else { 2304e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2305e6e9a74fSStefano Zampini } 23069ae82921SPaul Mullowney PetscFunctionReturn(0); 23079ae82921SPaul Mullowney } 23089ae82921SPaul Mullowney 23096fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2310ca45077fSPaul Mullowney { 2311b175d8bbSPaul Mullowney PetscErrorCode ierr; 23126e111a19SKarl Rupp 2313ca45077fSPaul Mullowney PetscFunctionBegin; 2314e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2315ca45077fSPaul Mullowney PetscFunctionReturn(0); 2316ca45077fSPaul Mullowney } 2317ca45077fSPaul Mullowney 23186fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 23199ae82921SPaul Mullowney { 23209ae82921SPaul Mullowney PetscErrorCode ierr; 23213fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 23223fa6b06aSMark Adams PetscBool is_seq = PETSC_TRUE; 23233fa6b06aSMark Adams PetscInt nnz_state = A->nonzerostate; 23249ae82921SPaul Mullowney PetscFunctionBegin; 2325bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 23263fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2327bc3f50f2SPaul Mullowney } 23283fa6b06aSMark Adams if (d_mat) { 23293fa6b06aSMark Adams cudaError_t err; 23303fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 23313fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 23323fa6b06aSMark Adams nnz_state = h_mat.nonzerostate; 23333fa6b06aSMark Adams is_seq = h_mat.seq; 23343fa6b06aSMark Adams } 23353fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 23363fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 23373fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU 23383fa6b06aSMark Adams ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 23393fa6b06aSMark Adams } else if (nnz_state > A->nonzerostate) { 23403fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 23413fa6b06aSMark Adams } 23423fa6b06aSMark Adams 23439ae82921SPaul Mullowney PetscFunctionReturn(0); 23449ae82921SPaul Mullowney } 23459ae82921SPaul Mullowney 23469ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2347e057df02SPaul Mullowney /*@ 23489ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2349e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2350e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2351e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2352e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2353e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 23549ae82921SPaul Mullowney 2355d083f849SBarry Smith Collective 23569ae82921SPaul Mullowney 23579ae82921SPaul Mullowney Input Parameters: 23589ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 23599ae82921SPaul Mullowney . m - number of rows 23609ae82921SPaul Mullowney . n - number of columns 23619ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 23629ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 23630298fd71SBarry Smith (possibly different for each row) or NULL 23649ae82921SPaul Mullowney 23659ae82921SPaul Mullowney Output Parameter: 23669ae82921SPaul Mullowney . A - the matrix 23679ae82921SPaul Mullowney 23689ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 23699ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 23709ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 23719ae82921SPaul Mullowney 23729ae82921SPaul Mullowney Notes: 23739ae82921SPaul Mullowney If nnz is given then nz is ignored 23749ae82921SPaul Mullowney 23759ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 23769ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 23779ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 23789ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 23799ae82921SPaul Mullowney 23809ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 23810298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 23829ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 23839ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 23849ae82921SPaul Mullowney 23859ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 23869ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 23879ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 23889ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 23899ae82921SPaul Mullowney 23909ae82921SPaul Mullowney Level: intermediate 23919ae82921SPaul Mullowney 2392e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 23939ae82921SPaul Mullowney @*/ 23949ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 23959ae82921SPaul Mullowney { 23969ae82921SPaul Mullowney PetscErrorCode ierr; 23979ae82921SPaul Mullowney 23989ae82921SPaul Mullowney PetscFunctionBegin; 23999ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 24009ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 24019ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 24029ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 24039ae82921SPaul Mullowney PetscFunctionReturn(0); 24049ae82921SPaul Mullowney } 24059ae82921SPaul Mullowney 24066fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 24079ae82921SPaul Mullowney { 24089ae82921SPaul Mullowney PetscErrorCode ierr; 24093fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 2410ab25e6cbSDominic Meiser 24119ae82921SPaul Mullowney PetscFunctionBegin; 24129ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 24133fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 24143fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2415470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 24169ae82921SPaul Mullowney } else { 2417470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2418aa372e3fSPaul Mullowney } 24193fa6b06aSMark Adams if (d_mat) { 24203fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 24213fa6b06aSMark Adams cudaError_t err; 24223fa6b06aSMark Adams PetscSplitCSRDataStructure h_mat; 24233fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 24243fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 24253fa6b06aSMark Adams if (h_mat.seq) { 24263fa6b06aSMark Adams if (a->compressedrow.use) { 24273fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 24283fa6b06aSMark Adams } 24293fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 24303fa6b06aSMark Adams } 24313fa6b06aSMark Adams } 2432ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2433ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2434ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2435ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 24367e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 24377e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 24389ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 24399ae82921SPaul Mullowney PetscFunctionReturn(0); 24409ae82921SPaul Mullowney } 24419ae82921SPaul Mullowney 2442ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 244395639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 24449ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 24459ff858a8SKarl Rupp { 24469ff858a8SKarl Rupp PetscErrorCode ierr; 24479ff858a8SKarl Rupp 24489ff858a8SKarl Rupp PetscFunctionBegin; 24499ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2450ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 24519ff858a8SKarl Rupp PetscFunctionReturn(0); 24529ff858a8SKarl Rupp } 24539ff858a8SKarl Rupp 245495639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 245595639643SRichard Tran Mills { 2456c58ef05eSStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2457e6e9a74fSStefano Zampini PetscErrorCode ierr; 2458e6e9a74fSStefano Zampini 245995639643SRichard Tran Mills PetscFunctionBegin; 2460e6e9a74fSStefano Zampini if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 246195639643SRichard Tran Mills if (flg) { 24627e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 24637e8381f9SStefano Zampini 246495639643SRichard Tran Mills A->ops->mult = MatMult_SeqAIJ; 246595639643SRichard Tran Mills A->ops->multadd = MatMultAdd_SeqAIJ; 2466c34f1ff0SRichard Tran Mills A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2467c34f1ff0SRichard Tran Mills A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2468e6e9a74fSStefano Zampini A->ops->multhermitiantranspose = NULL; 2469e6e9a74fSStefano Zampini A->ops->multhermitiantransposeadd = NULL; 2470e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2471e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 24727e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 24737e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 24747e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 247595639643SRichard Tran Mills } else { 247695639643SRichard Tran Mills A->ops->mult = MatMult_SeqAIJCUSPARSE; 247795639643SRichard Tran Mills A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 247895639643SRichard Tran Mills A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 247995639643SRichard Tran Mills A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2480e6e9a74fSStefano Zampini A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2481e6e9a74fSStefano Zampini A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2482e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2483e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 24847e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 24857e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 24867e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 248795639643SRichard Tran Mills } 248895639643SRichard Tran Mills A->boundtocpu = flg; 2489c58ef05eSStefano Zampini a->inode.use = flg; 249095639643SRichard Tran Mills PetscFunctionReturn(0); 249195639643SRichard Tran Mills } 249295639643SRichard Tran Mills 24933fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 24943fa6b06aSMark Adams { 24953fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 24963fa6b06aSMark Adams PetscErrorCode ierr; 24977e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 24987e8381f9SStefano Zampini 24993fa6b06aSMark Adams PetscFunctionBegin; 25003fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 25013fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 25027e8381f9SStefano Zampini if (spptr->mat) { 25037e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 25047e8381f9SStefano Zampini if (matrix->values) { 25057e8381f9SStefano Zampini both = PETSC_TRUE; 25067e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 25077e8381f9SStefano Zampini } 25087e8381f9SStefano Zampini } 25097e8381f9SStefano Zampini if (spptr->matTranspose) { 25107e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 25117e8381f9SStefano Zampini if (matrix->values) { 25127e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 25137e8381f9SStefano Zampini } 25147e8381f9SStefano Zampini } 25153fa6b06aSMark Adams d_mat = spptr->deviceMat; 25163fa6b06aSMark Adams } 25173fa6b06aSMark Adams if (d_mat) { 25183fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 25193fa6b06aSMark Adams PetscInt n = A->rmap->n, nnz = a->i[n]; 25203fa6b06aSMark Adams cudaError_t err; 25213fa6b06aSMark Adams PetscScalar *vals; 25223fa6b06aSMark Adams ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr); 25233fa6b06aSMark Adams err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 25243fa6b06aSMark Adams err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err); 25253fa6b06aSMark Adams } 25263fa6b06aSMark Adams ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 25277e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 25283fa6b06aSMark Adams 25293fa6b06aSMark Adams PetscFunctionReturn(0); 25303fa6b06aSMark Adams } 25313fa6b06aSMark Adams 253249735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 25339ae82921SPaul Mullowney { 25349ae82921SPaul Mullowney PetscErrorCode ierr; 2535aa372e3fSPaul Mullowney cusparseStatus_t stat; 253649735bf3SStefano Zampini Mat B; 25379ae82921SPaul Mullowney 25389ae82921SPaul Mullowney PetscFunctionBegin; 253949735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 254049735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 254149735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 254249735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 254349735bf3SStefano Zampini } 254449735bf3SStefano Zampini B = *newmat; 254549735bf3SStefano Zampini 254634136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 254734136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 254834136279SStefano Zampini 254949735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 25509ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 2551e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 2552e6e9a74fSStefano Zampini 2553e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2554e6e9a74fSStefano Zampini spptr->format = MAT_CUSPARSE_CSR; 2555e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2556e6e9a74fSStefano Zampini B->spptr = spptr; 25573fa6b06aSMark Adams spptr->deviceMat = NULL; 25589ae82921SPaul Mullowney } else { 2559e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 2560e6e9a74fSStefano Zampini 2561e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2562e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2563e6e9a74fSStefano Zampini B->spptr = spptr; 25649ae82921SPaul Mullowney } 2565e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 256649735bf3SStefano Zampini } 2567693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 25689ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 25699ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 257095639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2571693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 25723fa6b06aSMark Adams B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 25732205254eSKarl Rupp 2574e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 25759ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2576bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 25779ae82921SPaul Mullowney PetscFunctionReturn(0); 25789ae82921SPaul Mullowney } 25799ae82921SPaul Mullowney 258002fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 258102fe1965SBarry Smith { 258202fe1965SBarry Smith PetscErrorCode ierr; 258302fe1965SBarry Smith 258402fe1965SBarry Smith PetscFunctionBegin; 258505035670SJunchao Zhang ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 258602fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 25870ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2588afb2bd1cSJunchao Zhang ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 2589afb2bd1cSJunchao Zhang ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 2590afb2bd1cSJunchao Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 259102fe1965SBarry Smith PetscFunctionReturn(0); 259202fe1965SBarry Smith } 259302fe1965SBarry Smith 25943ca39a21SBarry Smith /*MC 2595e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2596e057df02SPaul Mullowney 2597e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 25982692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 25992692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2600e057df02SPaul Mullowney 2601e057df02SPaul Mullowney Options Database Keys: 2602e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2603aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2604a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2605e057df02SPaul Mullowney 2606e057df02SPaul Mullowney Level: beginner 2607e057df02SPaul Mullowney 26088468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2609e057df02SPaul Mullowney M*/ 26107f756511SDominic Meiser 261142c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 261242c9c57cSBarry Smith 26130f39cd5aSBarry Smith 26143ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 261542c9c57cSBarry Smith { 261642c9c57cSBarry Smith PetscErrorCode ierr; 261742c9c57cSBarry Smith 261842c9c57cSBarry Smith PetscFunctionBegin; 26193ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 26203ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 26213ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 26223ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 262342c9c57cSBarry Smith PetscFunctionReturn(0); 262442c9c57cSBarry Smith } 262529b38603SBarry Smith 2626470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 26277f756511SDominic Meiser { 2628e6e9a74fSStefano Zampini PetscErrorCode ierr; 26297f756511SDominic Meiser cusparseStatus_t stat; 26307f756511SDominic Meiser 26317f756511SDominic Meiser PetscFunctionBegin; 26327f756511SDominic Meiser if (*cusparsestruct) { 2633e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2634e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 26357f756511SDominic Meiser delete (*cusparsestruct)->workVector; 263681902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 26377e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 26387e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 26397e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_v; 26407e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_w; 26417e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 2642afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2643afb2bd1cSJunchao Zhang cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2644afb2bd1cSJunchao Zhang #endif 2645e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 26467f756511SDominic Meiser } 26477f756511SDominic Meiser PetscFunctionReturn(0); 26487f756511SDominic Meiser } 26497f756511SDominic Meiser 26507f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 26517f756511SDominic Meiser { 26527f756511SDominic Meiser PetscFunctionBegin; 26537f756511SDominic Meiser if (*mat) { 26547f756511SDominic Meiser delete (*mat)->values; 26557f756511SDominic Meiser delete (*mat)->column_indices; 26567f756511SDominic Meiser delete (*mat)->row_offsets; 26577f756511SDominic Meiser delete *mat; 26587f756511SDominic Meiser *mat = 0; 26597f756511SDominic Meiser } 26607f756511SDominic Meiser PetscFunctionReturn(0); 26617f756511SDominic Meiser } 26627f756511SDominic Meiser 2663470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 26647f756511SDominic Meiser { 26657f756511SDominic Meiser cusparseStatus_t stat; 26667f756511SDominic Meiser PetscErrorCode ierr; 26677f756511SDominic Meiser 26687f756511SDominic Meiser PetscFunctionBegin; 26697f756511SDominic Meiser if (*trifactor) { 267057d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2671afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 26727f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2673*1b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2674afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2675*1b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2676afb2bd1cSJunchao Zhang #endif 26777f756511SDominic Meiser delete *trifactor; 26787e8381f9SStefano Zampini *trifactor = NULL; 26797f756511SDominic Meiser } 26807f756511SDominic Meiser PetscFunctionReturn(0); 26817f756511SDominic Meiser } 26827f756511SDominic Meiser 2683470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 26847f756511SDominic Meiser { 26857f756511SDominic Meiser CsrMatrix *mat; 26867f756511SDominic Meiser cusparseStatus_t stat; 26877f756511SDominic Meiser cudaError_t err; 26887f756511SDominic Meiser 26897f756511SDominic Meiser PetscFunctionBegin; 26907f756511SDominic Meiser if (*matstruct) { 26917f756511SDominic Meiser if ((*matstruct)->mat) { 26927f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2693afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2694afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2695afb2bd1cSJunchao Zhang #else 26967f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 269757d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2698afb2bd1cSJunchao Zhang #endif 26997f756511SDominic Meiser } else { 27007f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 27017f756511SDominic Meiser CsrMatrix_Destroy(&mat); 27027f756511SDominic Meiser } 27037f756511SDominic Meiser } 270457d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 27057f756511SDominic Meiser delete (*matstruct)->cprowIndices; 2706afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 27077656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 27087656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2709afb2bd1cSJunchao Zhang 2710afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2711afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2712afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2713afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 2714afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 2715afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2716afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2717afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2718afb2bd1cSJunchao Zhang } 2719afb2bd1cSJunchao Zhang } 2720afb2bd1cSJunchao Zhang #endif 27217f756511SDominic Meiser delete *matstruct; 27227e8381f9SStefano Zampini *matstruct = NULL; 27237f756511SDominic Meiser } 27247f756511SDominic Meiser PetscFunctionReturn(0); 27257f756511SDominic Meiser } 27267f756511SDominic Meiser 2727ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 27287f756511SDominic Meiser { 2729e6e9a74fSStefano Zampini PetscErrorCode ierr; 2730e6e9a74fSStefano Zampini 27317f756511SDominic Meiser PetscFunctionBegin; 27327f756511SDominic Meiser if (*trifactors) { 2733e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2734e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2735e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2736e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 27377f756511SDominic Meiser delete (*trifactors)->rpermIndices; 27387f756511SDominic Meiser delete (*trifactors)->cpermIndices; 27397f756511SDominic Meiser delete (*trifactors)->workVector; 27407e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 27417e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 27427e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 2743ccdfe979SStefano Zampini } 2744ccdfe979SStefano Zampini PetscFunctionReturn(0); 2745ccdfe979SStefano Zampini } 2746ccdfe979SStefano Zampini 2747ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2748ccdfe979SStefano Zampini { 2749e6e9a74fSStefano Zampini PetscErrorCode ierr; 2750ccdfe979SStefano Zampini cusparseHandle_t handle; 2751ccdfe979SStefano Zampini cusparseStatus_t stat; 2752ccdfe979SStefano Zampini 2753ccdfe979SStefano Zampini PetscFunctionBegin; 2754ccdfe979SStefano Zampini if (*trifactors) { 2755e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 27567f756511SDominic Meiser if (handle = (*trifactors)->handle) { 275757d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 27587f756511SDominic Meiser } 2759e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 27607f756511SDominic Meiser } 27617f756511SDominic Meiser PetscFunctionReturn(0); 27627f756511SDominic Meiser } 27637e8381f9SStefano Zampini 27647e8381f9SStefano Zampini struct IJCompare 27657e8381f9SStefano Zampini { 27667e8381f9SStefano Zampini __host__ __device__ 27677e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 27687e8381f9SStefano Zampini { 27697e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 27707e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 27717e8381f9SStefano Zampini return false; 27727e8381f9SStefano Zampini } 27737e8381f9SStefano Zampini }; 27747e8381f9SStefano Zampini 27757e8381f9SStefano Zampini struct IJEqual 27767e8381f9SStefano Zampini { 27777e8381f9SStefano Zampini __host__ __device__ 27787e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 27797e8381f9SStefano Zampini { 27807e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 27817e8381f9SStefano Zampini return true; 27827e8381f9SStefano Zampini } 27837e8381f9SStefano Zampini }; 27847e8381f9SStefano Zampini 27857e8381f9SStefano Zampini struct IJDiff 27867e8381f9SStefano Zampini { 27877e8381f9SStefano Zampini __host__ __device__ 27887e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 27897e8381f9SStefano Zampini { 27907e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 27917e8381f9SStefano Zampini } 27927e8381f9SStefano Zampini }; 27937e8381f9SStefano Zampini 27947e8381f9SStefano Zampini struct IJSum 27957e8381f9SStefano Zampini { 27967e8381f9SStefano Zampini __host__ __device__ 27977e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 27987e8381f9SStefano Zampini { 27997e8381f9SStefano Zampini return t1||t2; 28007e8381f9SStefano Zampini } 28017e8381f9SStefano Zampini }; 28027e8381f9SStefano Zampini 28037e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 28047e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 28057e8381f9SStefano Zampini { 28067e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 28077e8381f9SStefano Zampini CsrMatrix *matrix; 28087e8381f9SStefano Zampini PetscErrorCode ierr; 28097e8381f9SStefano Zampini cudaError_t cerr; 28107e8381f9SStefano Zampini PetscInt n; 28117e8381f9SStefano Zampini 28127e8381f9SStefano Zampini PetscFunctionBegin; 28137e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 28147e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 28157e8381f9SStefano Zampini if (!cusp->cooPerm) { 28167e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 28177e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 28187e8381f9SStefano Zampini PetscFunctionReturn(0); 28197e8381f9SStefano Zampini } 28207e8381f9SStefano Zampini n = cusp->cooPerm->size(); 28217e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 28227e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 28237e8381f9SStefano Zampini if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 28247e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 28257e8381f9SStefano Zampini if (v) { 28267e8381f9SStefano Zampini cusp->cooPerm_v->assign(v,v+n); 28277e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 28287e8381f9SStefano Zampini } 28297e8381f9SStefano Zampini else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 28307e8381f9SStefano Zampini if (imode == ADD_VALUES) { 28317e8381f9SStefano Zampini if (cusp->cooPerm_a) { 28327e8381f9SStefano Zampini if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 28337e8381f9SStefano Zampini auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 28347e8381f9SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 28357e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 28367e8381f9SStefano Zampini } else { 28377e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 28387e8381f9SStefano Zampini matrix->values->begin())); 28397e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 28407e8381f9SStefano Zampini matrix->values->end())); 28417e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 28427e8381f9SStefano Zampini } 28437e8381f9SStefano Zampini } else { 28447e8381f9SStefano Zampini if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence) 28457e8381f9SStefano Zampini if we are inserting two different values into the same location */ 28467e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 28477e8381f9SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin()))); 28487e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 28497e8381f9SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end()))); 28507e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 28517e8381f9SStefano Zampini } else { 28527e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 28537e8381f9SStefano Zampini matrix->values->begin())); 28547e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 28557e8381f9SStefano Zampini matrix->values->end())); 28567e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 28577e8381f9SStefano Zampini } 28587e8381f9SStefano Zampini } 28597e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 28607e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 28617e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 28627e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 28637e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 28647e8381f9SStefano Zampini /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 28657e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 28667e8381f9SStefano Zampini PetscFunctionReturn(0); 28677e8381f9SStefano Zampini } 28687e8381f9SStefano Zampini 28697e8381f9SStefano Zampini #include <thrust/binary_search.h> 28707e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 28717e8381f9SStefano Zampini { 28727e8381f9SStefano Zampini PetscErrorCode ierr; 28737e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 28747e8381f9SStefano Zampini CsrMatrix *matrix; 28757e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 28767e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 28777e8381f9SStefano Zampini cudaError_t cerr; 28787e8381f9SStefano Zampini 28797e8381f9SStefano Zampini PetscFunctionBegin; 28807e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 28817e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 28827e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 28837e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 28847e8381f9SStefano Zampini if (n != cooPerm_n) { 28857e8381f9SStefano Zampini delete cusp->cooPerm; 28867e8381f9SStefano Zampini delete cusp->cooPerm_v; 28877e8381f9SStefano Zampini delete cusp->cooPerm_w; 28887e8381f9SStefano Zampini delete cusp->cooPerm_a; 28897e8381f9SStefano Zampini cusp->cooPerm = NULL; 28907e8381f9SStefano Zampini cusp->cooPerm_v = NULL; 28917e8381f9SStefano Zampini cusp->cooPerm_w = NULL; 28927e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 28937e8381f9SStefano Zampini } 28947e8381f9SStefano Zampini if (n) { 28957e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 28967e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 28977e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 28987e8381f9SStefano Zampini 28997e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 29007e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 29017e8381f9SStefano Zampini 29027e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 29037e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 29047e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 29057e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 29067e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 29077e8381f9SStefano Zampini 29087e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 29097e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 29107e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 29117e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 29127e8381f9SStefano Zampini 29137e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 29147e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 29157e8381f9SStefano Zampini delete cusp->cooPerm_a; 29167e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 29177e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 29187e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 29197e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 29207e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 29217e8381f9SStefano Zampini w[0] = 0; 29227e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 29237e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 29247e8381f9SStefano Zampini } 29257e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 29267e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 29277e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 29287e8381f9SStefano Zampini ii.begin()); 29297e8381f9SStefano Zampini 29307e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 29317e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 29327e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 29337e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 29347e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 29357e8381f9SStefano Zampini a->i[0] = 0; 29367e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 29377e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 29387e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 29397e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 29407e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 29417e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 29427e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 29437e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 29447e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 29457e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 29467e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 29477e8381f9SStefano Zampini } 29487e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 29497e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 29507e8381f9SStefano Zampini } else { 29517e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 29527e8381f9SStefano Zampini } 29537e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 29547e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 29557e8381f9SStefano Zampini 29567e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 29577e8381f9SStefano Zampini The code is so convoluted now that I prefer to copy garbage to the GPU */ 29587e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 29597e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 29607e8381f9SStefano Zampini A->nonzerostate++; 29617e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 29627e8381f9SStefano Zampini { 29637e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 29647e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 29657e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 29667e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 29677e8381f9SStefano Zampini } 29687e8381f9SStefano Zampini 29697e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 29707e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 29717e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 29727e8381f9SStefano Zampini PetscFunctionReturn(0); 29737e8381f9SStefano Zampini } 2974