19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16bc3f50f2SPaul Mullowney 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 749ae82921SPaul Mullowney 757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 817f756511SDominic Meiser 827e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 837e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 847e8381f9SStefano Zampini 85b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 86b06137fdSPaul Mullowney { 87b06137fdSPaul Mullowney cusparseStatus_t stat; 88b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 89b06137fdSPaul Mullowney 90b06137fdSPaul Mullowney PetscFunctionBegin; 91d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 92b06137fdSPaul Mullowney cusparsestruct->stream = stream; 9357d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 94b06137fdSPaul Mullowney PetscFunctionReturn(0); 95b06137fdSPaul Mullowney } 96b06137fdSPaul Mullowney 97b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 98b06137fdSPaul Mullowney { 99b06137fdSPaul Mullowney cusparseStatus_t stat; 100b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 101b06137fdSPaul Mullowney 102b06137fdSPaul Mullowney PetscFunctionBegin; 103d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1046b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 10516a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 10657d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 10716a2e217SAlejandro Lamas Daviña } 108b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1096b1cf21dSAlejandro Lamas Daviña } 11057d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 111b06137fdSPaul Mullowney PetscFunctionReturn(0); 112b06137fdSPaul Mullowney } 113b06137fdSPaul Mullowney 114b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 115b06137fdSPaul Mullowney { 116b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1177e8381f9SStefano Zampini PetscBool flg; 1187e8381f9SStefano Zampini PetscErrorCode ierr; 119ccdfe979SStefano Zampini 120b06137fdSPaul Mullowney PetscFunctionBegin; 1217e8381f9SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1227e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 123ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 124b06137fdSPaul Mullowney PetscFunctionReturn(0); 125b06137fdSPaul Mullowney } 126b06137fdSPaul Mullowney 127ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1289ae82921SPaul Mullowney { 1299ae82921SPaul Mullowney PetscFunctionBegin; 1309ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1319ae82921SPaul Mullowney PetscFunctionReturn(0); 1329ae82921SPaul Mullowney } 1339ae82921SPaul Mullowney 134c708e6cdSJed Brown /*MC 135087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 136087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 137087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 138087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 139087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 140087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 141c708e6cdSJed Brown 1429ae82921SPaul Mullowney Level: beginner 143c708e6cdSJed Brown 1443ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 145c708e6cdSJed Brown M*/ 1469ae82921SPaul Mullowney 14742c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1489ae82921SPaul Mullowney { 1499ae82921SPaul Mullowney PetscErrorCode ierr; 150bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1519ae82921SPaul Mullowney 1529ae82921SPaul Mullowney PetscFunctionBegin; 153bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 154bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1552c7c0729SBarry Smith (*B)->factortype = ftype; 1562c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1579ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1582205254eSKarl Rupp 159087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16033d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1619ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1629ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 163087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 164087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 165087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1669ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 167bc3f50f2SPaul Mullowney 168fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1693ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1709ae82921SPaul Mullowney PetscFunctionReturn(0); 1719ae82921SPaul Mullowney } 1729ae82921SPaul Mullowney 173bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 174ca45077fSPaul Mullowney { 175aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1766e111a19SKarl Rupp 177ca45077fSPaul Mullowney PetscFunctionBegin; 178ca45077fSPaul Mullowney switch (op) { 179e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 180aa372e3fSPaul Mullowney cusparsestruct->format = format; 181ca45077fSPaul Mullowney break; 182e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 183aa372e3fSPaul Mullowney cusparsestruct->format = format; 184ca45077fSPaul Mullowney break; 185ca45077fSPaul Mullowney default: 18636d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 187ca45077fSPaul Mullowney } 188ca45077fSPaul Mullowney PetscFunctionReturn(0); 189ca45077fSPaul Mullowney } 1909ae82921SPaul Mullowney 191e057df02SPaul Mullowney /*@ 192e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 193e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 194aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 195e057df02SPaul Mullowney Not Collective 196e057df02SPaul Mullowney 197e057df02SPaul Mullowney Input Parameters: 1988468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 19936d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2002692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 201e057df02SPaul Mullowney 202e057df02SPaul Mullowney Output Parameter: 203e057df02SPaul Mullowney 204e057df02SPaul Mullowney Level: intermediate 205e057df02SPaul Mullowney 2068468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 207e057df02SPaul Mullowney @*/ 208e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 209e057df02SPaul Mullowney { 210e057df02SPaul Mullowney PetscErrorCode ierr; 2116e111a19SKarl Rupp 212e057df02SPaul Mullowney PetscFunctionBegin; 213e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 214e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 215e057df02SPaul Mullowney PetscFunctionReturn(0); 216e057df02SPaul Mullowney } 217e057df02SPaul Mullowney 218e6e9a74fSStefano Zampini /*@ 219e6e9a74fSStefano Zampini MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 220e6e9a74fSStefano Zampini 221e6e9a74fSStefano Zampini Collective on mat 222e6e9a74fSStefano Zampini 223e6e9a74fSStefano Zampini Input Parameters: 224e6e9a74fSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE 225e6e9a74fSStefano Zampini - transgen - the boolean flag 226e6e9a74fSStefano Zampini 227e6e9a74fSStefano Zampini Level: intermediate 228e6e9a74fSStefano Zampini 229e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE 230e6e9a74fSStefano Zampini @*/ 231e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 232e6e9a74fSStefano Zampini { 233e6e9a74fSStefano Zampini PetscErrorCode ierr; 234e6e9a74fSStefano Zampini PetscBool flg; 235e6e9a74fSStefano Zampini 236e6e9a74fSStefano Zampini PetscFunctionBegin; 237e6e9a74fSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 238e6e9a74fSStefano Zampini ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239e6e9a74fSStefano Zampini if (flg) { 240e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 24154da937aSStefano Zampini 242e6e9a74fSStefano Zampini if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 243e6e9a74fSStefano Zampini cusp->transgen = transgen; 24454da937aSStefano Zampini if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 24554da937aSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 24654da937aSStefano Zampini } 247e6e9a74fSStefano Zampini } 248e6e9a74fSStefano Zampini PetscFunctionReturn(0); 249e6e9a74fSStefano Zampini } 250e6e9a74fSStefano Zampini 2514416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2529ae82921SPaul Mullowney { 2539ae82921SPaul Mullowney PetscErrorCode ierr; 254e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2559ae82921SPaul Mullowney PetscBool flg; 256a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2576e111a19SKarl Rupp 2589ae82921SPaul Mullowney PetscFunctionBegin; 259e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2609ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 26154da937aSStefano Zampini PetscBool transgen = cusparsestruct->transgen; 26254da937aSStefano Zampini 26354da937aSStefano Zampini ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 264afb2bd1cSJunchao Zhang if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 265afb2bd1cSJunchao Zhang 266e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 267a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 268afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 269afb2bd1cSJunchao Zhang 2704c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 271a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 272afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 273afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 274afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 275afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 276afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 277afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 278afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 279afb2bd1cSJunchao Zhang 280afb2bd1cSJunchao Zhang cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 281afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 282afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 283afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 284afb2bd1cSJunchao Zhang 285afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 286afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 287afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 288afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 289afb2bd1cSJunchao Zhang #endif 2904c87dfd4SPaul Mullowney } 2910af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 2929ae82921SPaul Mullowney PetscFunctionReturn(0); 2939ae82921SPaul Mullowney } 2949ae82921SPaul Mullowney 2956fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2969ae82921SPaul Mullowney { 297*da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2989ae82921SPaul Mullowney PetscErrorCode ierr; 2999ae82921SPaul Mullowney 3009ae82921SPaul Mullowney PetscFunctionBegin; 301*da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3029ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3039ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3049ae82921SPaul Mullowney PetscFunctionReturn(0); 3059ae82921SPaul Mullowney } 3069ae82921SPaul Mullowney 3076fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3089ae82921SPaul Mullowney { 309*da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3109ae82921SPaul Mullowney PetscErrorCode ierr; 3119ae82921SPaul Mullowney 3129ae82921SPaul Mullowney PetscFunctionBegin; 313*da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3149ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3159ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3169ae82921SPaul Mullowney PetscFunctionReturn(0); 3179ae82921SPaul Mullowney } 3189ae82921SPaul Mullowney 319087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 320087f3262SPaul Mullowney { 321*da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 322087f3262SPaul Mullowney PetscErrorCode ierr; 323087f3262SPaul Mullowney 324087f3262SPaul Mullowney PetscFunctionBegin; 325*da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 326087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 327087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 328087f3262SPaul Mullowney PetscFunctionReturn(0); 329087f3262SPaul Mullowney } 330087f3262SPaul Mullowney 331087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 332087f3262SPaul Mullowney { 333*da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 334087f3262SPaul Mullowney PetscErrorCode ierr; 335087f3262SPaul Mullowney 336087f3262SPaul Mullowney PetscFunctionBegin; 337*da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 338087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 339087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 340087f3262SPaul Mullowney PetscFunctionReturn(0); 341087f3262SPaul Mullowney } 342087f3262SPaul Mullowney 343087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3449ae82921SPaul Mullowney { 3459ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3469ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3479ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 348aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3499ae82921SPaul Mullowney cusparseStatus_t stat; 3509ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3519ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3529ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3539ae82921SPaul Mullowney PetscScalar *AALo; 3549ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 355b175d8bbSPaul Mullowney PetscErrorCode ierr; 35657d48284SJunchao Zhang cudaError_t cerr; 3579ae82921SPaul Mullowney 3589ae82921SPaul Mullowney PetscFunctionBegin; 359cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 360c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3619ae82921SPaul Mullowney try { 3629ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3639ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 364*da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 365*da79fbbcSStefano Zampini if (!loTriFactor) { 3669ae82921SPaul Mullowney 3679ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 36857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 36957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 3709ae82921SPaul Mullowney 3719ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3729ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3739ae82921SPaul Mullowney AiLo[n] = nzLower; 3749ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3759ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3769ae82921SPaul Mullowney v = aa; 3779ae82921SPaul Mullowney vi = aj; 3789ae82921SPaul Mullowney offset = 1; 3799ae82921SPaul Mullowney rowOffset= 1; 3809ae82921SPaul Mullowney for (i=1; i<n; i++) { 3819ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 382e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3839ae82921SPaul Mullowney AiLo[i] = rowOffset; 3849ae82921SPaul Mullowney rowOffset += nz+1; 3859ae82921SPaul Mullowney 386580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 387580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 3889ae82921SPaul Mullowney 3899ae82921SPaul Mullowney offset += nz; 3909ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3919ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3929ae82921SPaul Mullowney offset += 1; 3939ae82921SPaul Mullowney 3949ae82921SPaul Mullowney v += nz; 3959ae82921SPaul Mullowney vi += nz; 3969ae82921SPaul Mullowney } 3972205254eSKarl Rupp 398aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 399*da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 400*da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 401aa372e3fSPaul Mullowney /* Create the matrix description */ 40257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 40357d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 405afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 406afb2bd1cSJunchao Zhang #else 40757d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 408afb2bd1cSJunchao Zhang #endif 40957d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 41057d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 411aa372e3fSPaul Mullowney 412aa372e3fSPaul Mullowney /* set the operation */ 413aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 414aa372e3fSPaul Mullowney 415aa372e3fSPaul Mullowney /* set the matrix */ 416aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 417aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 418aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 419aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 420aa372e3fSPaul Mullowney 421aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 422aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 423aa372e3fSPaul Mullowney 424aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 425aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 426aa372e3fSPaul Mullowney 427aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 428aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 429aa372e3fSPaul Mullowney 430afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 431*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 432afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 434afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 435afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 436afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 437afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 438afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 439afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 440afb2bd1cSJunchao Zhang #endif 441afb2bd1cSJunchao Zhang 442aa372e3fSPaul Mullowney /* perform the solve analysis */ 443aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 444aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 445aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 446afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 4471b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 448afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 449afb2bd1cSJunchao Zhang #endif 450afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 451*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 452*da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 453aa372e3fSPaul Mullowney 454*da79fbbcSStefano Zampini /* assign the pointer */ 455aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4562205254eSKarl Rupp 45757d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 45857d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4594863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 460*da79fbbcSStefano Zampini } else { /* update values only */ 461*da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 462*da79fbbcSStefano Zampini AALo[0] = 1.0; 463*da79fbbcSStefano Zampini v = aa; 464*da79fbbcSStefano Zampini vi = aj; 465*da79fbbcSStefano Zampini offset = 1; 466*da79fbbcSStefano Zampini for (i=1; i<n; i++) { 467*da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 468*da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 469*da79fbbcSStefano Zampini offset += nz; 470*da79fbbcSStefano Zampini AALo[offset] = 1.0; 471*da79fbbcSStefano Zampini offset += 1; 472*da79fbbcSStefano Zampini v += nz; 473*da79fbbcSStefano Zampini } 474*da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 475*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 476*da79fbbcSStefano Zampini } 477*da79fbbcSStefano Zampini cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 4789ae82921SPaul Mullowney } catch(char *ex) { 4799ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4809ae82921SPaul Mullowney } 4819ae82921SPaul Mullowney } 4829ae82921SPaul Mullowney PetscFunctionReturn(0); 4839ae82921SPaul Mullowney } 4849ae82921SPaul Mullowney 485087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4869ae82921SPaul Mullowney { 4879ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4889ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4899ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 490aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4919ae82921SPaul Mullowney cusparseStatus_t stat; 4929ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 4939ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4949ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4959ae82921SPaul Mullowney PetscScalar *AAUp; 4969ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 4979ae82921SPaul Mullowney PetscErrorCode ierr; 49857d48284SJunchao Zhang cudaError_t cerr; 4999ae82921SPaul Mullowney 5009ae82921SPaul Mullowney PetscFunctionBegin; 501cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 502c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5039ae82921SPaul Mullowney try { 5049ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 5059ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 506*da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 507*da79fbbcSStefano Zampini if (!upTriFactor) { 5089ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 50957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 51057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5119ae82921SPaul Mullowney 5129ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5139ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5149ae82921SPaul Mullowney AiUp[n]=nzUpper; 5159ae82921SPaul Mullowney offset = nzUpper; 5169ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5179ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5189ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5199ae82921SPaul Mullowney 520e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5219ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5229ae82921SPaul Mullowney 523e057df02SPaul Mullowney /* decrement the offset */ 5249ae82921SPaul Mullowney offset -= (nz+1); 5259ae82921SPaul Mullowney 526e057df02SPaul Mullowney /* first, set the diagonal elements */ 5279ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 52809f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5299ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5309ae82921SPaul Mullowney 531580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 532580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5339ae82921SPaul Mullowney } 5342205254eSKarl Rupp 535aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 536*da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 537*da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5382205254eSKarl Rupp 539aa372e3fSPaul Mullowney /* Create the matrix description */ 54057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 54157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 543afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 544afb2bd1cSJunchao Zhang #else 54557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 546afb2bd1cSJunchao Zhang #endif 54757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 54857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 549aa372e3fSPaul Mullowney 550aa372e3fSPaul Mullowney /* set the operation */ 551aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 552aa372e3fSPaul Mullowney 553aa372e3fSPaul Mullowney /* set the matrix */ 554aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 555aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 556aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 557aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 558aa372e3fSPaul Mullowney 559aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 560aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 561aa372e3fSPaul Mullowney 562aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 563aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 564aa372e3fSPaul Mullowney 565aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 566aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 567aa372e3fSPaul Mullowney 568afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 569*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 570afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 572afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 573afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 574afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 575afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 576afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 577afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 578afb2bd1cSJunchao Zhang #endif 579afb2bd1cSJunchao Zhang 580aa372e3fSPaul Mullowney /* perform the solve analysis */ 581aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 582aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 583aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 584afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 5851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 586afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 587afb2bd1cSJunchao Zhang #endif 588afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 589*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 590*da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 591aa372e3fSPaul Mullowney 592*da79fbbcSStefano Zampini /* assign the pointer */ 593aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 5942205254eSKarl Rupp 59557d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 59657d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 5974863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 598*da79fbbcSStefano Zampini } else { 599*da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 600*da79fbbcSStefano Zampini offset = nzUpper; 601*da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 602*da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 603*da79fbbcSStefano Zampini 604*da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 605*da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 606*da79fbbcSStefano Zampini 607*da79fbbcSStefano Zampini /* decrement the offset */ 608*da79fbbcSStefano Zampini offset -= (nz+1); 609*da79fbbcSStefano Zampini 610*da79fbbcSStefano Zampini /* first, set the diagonal elements */ 611*da79fbbcSStefano Zampini AAUp[offset] = 1./v[nz]; 612*da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 613*da79fbbcSStefano Zampini } 614*da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 615*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 616*da79fbbcSStefano Zampini } 617*da79fbbcSStefano Zampini cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 6189ae82921SPaul Mullowney } catch(char *ex) { 6199ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6209ae82921SPaul Mullowney } 6219ae82921SPaul Mullowney } 6229ae82921SPaul Mullowney PetscFunctionReturn(0); 6239ae82921SPaul Mullowney } 6249ae82921SPaul Mullowney 625087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6269ae82921SPaul Mullowney { 6279ae82921SPaul Mullowney PetscErrorCode ierr; 6289ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6299ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6309ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6319ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6329ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6339ae82921SPaul Mullowney 6349ae82921SPaul Mullowney PetscFunctionBegin; 635*da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 636087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 637087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6382205254eSKarl Rupp 639*da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 640aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6419ae82921SPaul Mullowney 642c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 643e057df02SPaul Mullowney /* lower triangular indices */ 6449ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 645*da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 646*da79fbbcSStefano Zampini const PetscInt *r; 647*da79fbbcSStefano Zampini 648*da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 649aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 650aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6519ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 652*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 653*da79fbbcSStefano Zampini } 6549ae82921SPaul Mullowney 655e057df02SPaul Mullowney /* upper triangular indices */ 6569ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 657*da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 658*da79fbbcSStefano Zampini const PetscInt *c; 659*da79fbbcSStefano Zampini 660*da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 661aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 662aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6639ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 664*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 665*da79fbbcSStefano Zampini } 6669ae82921SPaul Mullowney PetscFunctionReturn(0); 6679ae82921SPaul Mullowney } 6689ae82921SPaul Mullowney 669087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 670087f3262SPaul Mullowney { 671087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 672087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 673aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 674aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 675087f3262SPaul Mullowney cusparseStatus_t stat; 676087f3262SPaul Mullowney PetscErrorCode ierr; 67757d48284SJunchao Zhang cudaError_t cerr; 678087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 679087f3262SPaul Mullowney PetscScalar *AAUp; 680087f3262SPaul Mullowney PetscScalar *AALo; 681087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 682087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 683087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 684087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 685087f3262SPaul Mullowney 686087f3262SPaul Mullowney PetscFunctionBegin; 687cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 688c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 689087f3262SPaul Mullowney try { 690*da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 691*da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 692*da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 693087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 69457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 69557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 696087f3262SPaul Mullowney 697087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 698087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 699087f3262SPaul Mullowney AiUp[n]=nzUpper; 700087f3262SPaul Mullowney offset = 0; 701087f3262SPaul Mullowney for (i=0; i<n; i++) { 702087f3262SPaul Mullowney /* set the pointers */ 703087f3262SPaul Mullowney v = aa + ai[i]; 704087f3262SPaul Mullowney vj = aj + ai[i]; 705087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 706087f3262SPaul Mullowney 707087f3262SPaul Mullowney /* first, set the diagonal elements */ 708087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 70909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 710087f3262SPaul Mullowney AiUp[i] = offset; 71109f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 712087f3262SPaul Mullowney 713087f3262SPaul Mullowney offset+=1; 714087f3262SPaul Mullowney if (nz>0) { 715f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 716580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 717087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 718087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 719087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 720087f3262SPaul Mullowney } 721087f3262SPaul Mullowney offset+=nz; 722087f3262SPaul Mullowney } 723087f3262SPaul Mullowney } 724087f3262SPaul Mullowney 725aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 726*da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 727*da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 728087f3262SPaul Mullowney 729aa372e3fSPaul Mullowney /* Create the matrix description */ 73057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 73157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 733afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 734afb2bd1cSJunchao Zhang #else 73557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 736afb2bd1cSJunchao Zhang #endif 73757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 73857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 739087f3262SPaul Mullowney 740aa372e3fSPaul Mullowney /* set the matrix */ 741aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 742aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 743aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 744aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 745aa372e3fSPaul Mullowney 746aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 747aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 748aa372e3fSPaul Mullowney 749aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 750aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 751aa372e3fSPaul Mullowney 752aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 753aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 754aa372e3fSPaul Mullowney 755afb2bd1cSJunchao Zhang /* set the operation */ 756afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 757afb2bd1cSJunchao Zhang 758afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 759*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 760afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 762afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 763afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 764afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 765afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 766afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 767afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 768afb2bd1cSJunchao Zhang #endif 769afb2bd1cSJunchao Zhang 770aa372e3fSPaul Mullowney /* perform the solve analysis */ 771aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 772aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 773aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 774afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 7751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 776afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 777afb2bd1cSJunchao Zhang #endif 778afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 779*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 780*da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 781aa372e3fSPaul Mullowney 782*da79fbbcSStefano Zampini /* assign the pointer */ 783aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 784aa372e3fSPaul Mullowney 785aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 786*da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 787*da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 788aa372e3fSPaul Mullowney 789aa372e3fSPaul Mullowney /* Create the matrix description */ 79057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 79157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7921b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 793afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 794afb2bd1cSJunchao Zhang #else 79557d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 796afb2bd1cSJunchao Zhang #endif 79757d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 79857d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 799aa372e3fSPaul Mullowney 800aa372e3fSPaul Mullowney /* set the operation */ 801aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 802aa372e3fSPaul Mullowney 803aa372e3fSPaul Mullowney /* set the matrix */ 804aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 805aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 806aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 807aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 808aa372e3fSPaul Mullowney 809aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 810aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 811aa372e3fSPaul Mullowney 812aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 813aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 814aa372e3fSPaul Mullowney 815aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 816aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 817aa372e3fSPaul Mullowney 818afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 819*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 820afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8211b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 822afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 823afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 824afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 825afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 826afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 827afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 828afb2bd1cSJunchao Zhang #endif 829afb2bd1cSJunchao Zhang 830aa372e3fSPaul Mullowney /* perform the solve analysis */ 831aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 832aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 833aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 834afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 8351b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 836afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 837afb2bd1cSJunchao Zhang #endif 838afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 839*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 840*da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 841aa372e3fSPaul Mullowney 842*da79fbbcSStefano Zampini /* assign the pointer */ 843aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 844087f3262SPaul Mullowney 845*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 84657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 84757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 848*da79fbbcSStefano Zampini } else { 849*da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 850*da79fbbcSStefano Zampini offset = 0; 851*da79fbbcSStefano Zampini for (i=0; i<n; i++) { 852*da79fbbcSStefano Zampini /* set the pointers */ 853*da79fbbcSStefano Zampini v = aa + ai[i]; 854*da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 855*da79fbbcSStefano Zampini 856*da79fbbcSStefano Zampini /* first, set the diagonal elements */ 857*da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 858*da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 859*da79fbbcSStefano Zampini 860*da79fbbcSStefano Zampini offset+=1; 861*da79fbbcSStefano Zampini if (nz>0) { 862*da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 863*da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 864*da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 865*da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 866*da79fbbcSStefano Zampini } 867*da79fbbcSStefano Zampini offset+=nz; 868*da79fbbcSStefano Zampini } 869*da79fbbcSStefano Zampini } 870*da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 871*da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 872*da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 873*da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 874*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 875*da79fbbcSStefano Zampini } 87657d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 87757d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 878087f3262SPaul Mullowney } catch(char *ex) { 879087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 880087f3262SPaul Mullowney } 881087f3262SPaul Mullowney } 882087f3262SPaul Mullowney PetscFunctionReturn(0); 883087f3262SPaul Mullowney } 884087f3262SPaul Mullowney 885087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8869ae82921SPaul Mullowney { 8879ae82921SPaul Mullowney PetscErrorCode ierr; 888087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 889087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 890087f3262SPaul Mullowney IS ip = a->row; 891087f3262SPaul Mullowney PetscBool perm_identity; 892087f3262SPaul Mullowney PetscInt n = A->rmap->n; 893087f3262SPaul Mullowney 894087f3262SPaul Mullowney PetscFunctionBegin; 895*da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 896087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 897*da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 898aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 899aa372e3fSPaul Mullowney 900*da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 901*da79fbbcSStefano Zampini 902087f3262SPaul Mullowney /* lower triangular indices */ 903087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 904087f3262SPaul Mullowney if (!perm_identity) { 9054e4bbfaaSStefano Zampini IS iip; 906*da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9074e4bbfaaSStefano Zampini 9084e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9094e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 910*da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 911aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 912aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 913aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9144e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9154e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9164e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 917087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 918*da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 919*da79fbbcSStefano Zampini } 920087f3262SPaul Mullowney PetscFunctionReturn(0); 921087f3262SPaul Mullowney } 922087f3262SPaul Mullowney 9236fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 9249ae82921SPaul Mullowney { 9259ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 9269ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 9279ae82921SPaul Mullowney PetscBool row_identity,col_identity; 928b175d8bbSPaul Mullowney PetscErrorCode ierr; 9299ae82921SPaul Mullowney 9309ae82921SPaul Mullowney PetscFunctionBegin; 9319ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 932ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 933e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 9349ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 9359ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 936bda325fcSPaul Mullowney if (row_identity && col_identity) { 937bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 938bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9394e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9404e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 941bda325fcSPaul Mullowney } else { 942bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 943bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9444e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9454e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 946bda325fcSPaul Mullowney } 9478dc1d2a3SPaul Mullowney 948e057df02SPaul Mullowney /* get the triangular factors */ 949087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 9509ae82921SPaul Mullowney PetscFunctionReturn(0); 9519ae82921SPaul Mullowney } 9529ae82921SPaul Mullowney 953087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 954087f3262SPaul Mullowney { 955087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 956087f3262SPaul Mullowney IS ip = b->row; 957087f3262SPaul Mullowney PetscBool perm_identity; 958b175d8bbSPaul Mullowney PetscErrorCode ierr; 959087f3262SPaul Mullowney 960087f3262SPaul Mullowney PetscFunctionBegin; 961087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 962ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 963087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 964087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 965087f3262SPaul Mullowney if (perm_identity) { 966087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 967087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9684e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9694e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 970087f3262SPaul Mullowney } else { 971087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 972087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9734e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9744e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 975087f3262SPaul Mullowney } 976087f3262SPaul Mullowney 977087f3262SPaul Mullowney /* get the triangular factors */ 978087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 979087f3262SPaul Mullowney PetscFunctionReturn(0); 980087f3262SPaul Mullowney } 9819ae82921SPaul Mullowney 982b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 983bda325fcSPaul Mullowney { 984bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 985aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 986aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 987*da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 988*da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 989bda325fcSPaul Mullowney cusparseStatus_t stat; 990aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 991aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 992aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 993aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 9941b0a6780SStefano Zampini cudaError_t cerr; 995*da79fbbcSStefano Zampini PetscErrorCode ierr; 996b175d8bbSPaul Mullowney 997bda325fcSPaul Mullowney PetscFunctionBegin; 998aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 999*da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1000*da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1001aa372e3fSPaul Mullowney 1002aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1003aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1004aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1005aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1006aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1007aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1008aa372e3fSPaul Mullowney 1009aa372e3fSPaul Mullowney /* Create the matrix description */ 101057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 101157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 101257d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 101357d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 101457d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1015aa372e3fSPaul Mullowney 1016aa372e3fSPaul Mullowney /* set the operation */ 1017aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1018aa372e3fSPaul Mullowney 1019aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1020aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1021afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1022afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1023aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1024afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1025afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1026afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1027aa372e3fSPaul Mullowney 1028aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1029afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1030afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1031afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1032afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1033afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1034afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1035afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1036afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1037afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1038afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10391b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1040afb2bd1cSJunchao Zhang #endif 1041afb2bd1cSJunchao Zhang 1042*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1043aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1044aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1045aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1046aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1047aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1048aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1049afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1050afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1051afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1052afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1053afb2bd1cSJunchao Zhang #else 1054afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1055afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1056afb2bd1cSJunchao Zhang #endif 1057afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1058*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1059*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1060aa372e3fSPaul Mullowney 1061afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1062*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1063afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1065afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1066afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1067afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1069afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1070afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1071afb2bd1cSJunchao Zhang #endif 1072afb2bd1cSJunchao Zhang 1073afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1074aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1075afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1077afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 10781b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1079afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1080afb2bd1cSJunchao Zhang #endif 1081afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1082*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1083*da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1084aa372e3fSPaul Mullowney 1085*da79fbbcSStefano Zampini /* assign the pointer */ 1086aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1087aa372e3fSPaul Mullowney 1088aa372e3fSPaul Mullowney /*********************************************/ 1089aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1090aa372e3fSPaul Mullowney /*********************************************/ 1091aa372e3fSPaul Mullowney 1092aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1093*da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1094*da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1095aa372e3fSPaul Mullowney 1096aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1097aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1098aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1099aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1100aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1101aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1102aa372e3fSPaul Mullowney 1103aa372e3fSPaul Mullowney /* Create the matrix description */ 110457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 110557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 110657d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 110757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 110857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1109aa372e3fSPaul Mullowney 1110aa372e3fSPaul Mullowney /* set the operation */ 1111aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1112aa372e3fSPaul Mullowney 1113aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1114aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1115afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1116afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1117aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1118afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1119afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1120afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1121aa372e3fSPaul Mullowney 1122aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1123afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1124afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1125afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1126afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1127afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1128afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1129afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1130afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1131afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1132afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1133afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1134afb2bd1cSJunchao Zhang #endif 1135afb2bd1cSJunchao Zhang 1136*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1137aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1138aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1139aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1140aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1141aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1142aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1143afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1144afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1145afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1146afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1147afb2bd1cSJunchao Zhang #else 1148afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1149afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1150afb2bd1cSJunchao Zhang #endif 1151afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1152*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1153*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1154aa372e3fSPaul Mullowney 1155afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1156*da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1157afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11581b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1159afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1160afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1161afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1163afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1164afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1165afb2bd1cSJunchao Zhang #endif 1166afb2bd1cSJunchao Zhang 1167afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1168aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 11721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1173afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1174afb2bd1cSJunchao Zhang #endif 1175afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1176*da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1177*da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1178aa372e3fSPaul Mullowney 1179*da79fbbcSStefano Zampini /* assign the pointer */ 1180aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1181bda325fcSPaul Mullowney PetscFunctionReturn(0); 1182bda325fcSPaul Mullowney } 1183bda325fcSPaul Mullowney 1184b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1185bda325fcSPaul Mullowney { 1186aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1187aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1188aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1189bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1190bda325fcSPaul Mullowney cusparseStatus_t stat; 1191aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1192b06137fdSPaul Mullowney cudaError_t err; 119385ba7357SStefano Zampini PetscErrorCode ierr; 1194b175d8bbSPaul Mullowney 1195bda325fcSPaul Mullowney PetscFunctionBegin; 119685ba7357SStefano Zampini if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 119785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 119885ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 119985ba7357SStefano Zampini /* create cusparse matrix */ 1200aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 120157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1202aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 120357d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 120457d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1205aa372e3fSPaul Mullowney 1206b06137fdSPaul Mullowney /* set alpha and beta */ 1207afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12087656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12097656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1210afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12117656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12127656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 121357d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1214b06137fdSPaul Mullowney 1215aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1216aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1217aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 1218554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1219554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1220aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1221a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1222aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1223aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1224a3fdcf43SKarl Rupp 122581902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 122681902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1227afb2bd1cSJunchao Zhang 122881902715SJunchao Zhang /* compute the transpose, i.e. the CSC */ 1229afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1230afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1231afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1232afb2bd1cSJunchao Zhang matrix->values->data().get(), 1233afb2bd1cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1234afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1235afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1236afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1237afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1238afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1239afb2bd1cSJunchao Zhang err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1240afb2bd1cSJunchao Zhang #endif 1241afb2bd1cSJunchao Zhang 1242a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1243a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 1244aa372e3fSPaul Mullowney matrix->values->data().get(), 124581902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1246aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1247aa372e3fSPaul Mullowney matrixT->values->data().get(), 1248afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1249afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1250afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1251afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1252afb2bd1cSJunchao Zhang #else 1253afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1254afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1255afb2bd1cSJunchao Zhang #endif 1256afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1257aa372e3fSPaul Mullowney matstructT->mat = matrixT; 1258afb2bd1cSJunchao Zhang 1259afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1260afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1261afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1262afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1263afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1264afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1266afb2bd1cSJunchao Zhang #endif 1267aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1268afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1269afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1270afb2bd1cSJunchao Zhang #else 1271aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 127251c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 127351c6d536SStefano Zampini /* First convert HYB to CSR */ 1274aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1275aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1276aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1277aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1278aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1279aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1280aa372e3fSPaul Mullowney 1281aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1282aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1283aa372e3fSPaul Mullowney temp->values->data().get(), 1284aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 128557d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1286aa372e3fSPaul Mullowney 1287aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1288aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1289aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1290aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1291aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1292aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1293aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1294aa372e3fSPaul Mullowney 1295aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1296aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1297aa372e3fSPaul Mullowney temp->values->data().get(), 1298aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1299aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1300aa372e3fSPaul Mullowney tempT->values->data().get(), 1301aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1302aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 130357d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1304aa372e3fSPaul Mullowney 1305aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1306aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 130757d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1308aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1309aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1310aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1311aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1312aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1313aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 131457d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1315aa372e3fSPaul Mullowney 1316aa372e3fSPaul Mullowney /* assign the pointer */ 1317aa372e3fSPaul Mullowney matstructT->mat = hybMat; 1318aa372e3fSPaul Mullowney /* delete temporaries */ 1319aa372e3fSPaul Mullowney if (tempT) { 1320aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1321aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1322aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1323aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1324087f3262SPaul Mullowney } 1325aa372e3fSPaul Mullowney if (temp) { 1326aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1327aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1328aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1329aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1330aa372e3fSPaul Mullowney } 1331afb2bd1cSJunchao Zhang #endif 1332aa372e3fSPaul Mullowney } 133305035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 133485ba7357SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 133585ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1336213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1337213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1338aa372e3fSPaul Mullowney /* assign the pointer */ 1339aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1340bda325fcSPaul Mullowney PetscFunctionReturn(0); 1341bda325fcSPaul Mullowney } 1342bda325fcSPaul Mullowney 13434e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 13446fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1345bda325fcSPaul Mullowney { 1346c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1347465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1348465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1349465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1350465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1351bda325fcSPaul Mullowney cusparseStatus_t stat; 1352bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1353aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1354aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1355aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1356b175d8bbSPaul Mullowney PetscErrorCode ierr; 135757d48284SJunchao Zhang cudaError_t cerr; 1358bda325fcSPaul Mullowney 1359bda325fcSPaul Mullowney PetscFunctionBegin; 1360aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1361aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1362bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1363aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1364aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1365bda325fcSPaul Mullowney } 1366bda325fcSPaul Mullowney 1367bda325fcSPaul Mullowney /* Get the GPU pointers */ 1368c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1369c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1370c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1371c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1372bda325fcSPaul Mullowney 13737a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1374aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1375c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1376c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1377c41cb2e2SAlejandro Lamas Daviña xGPU); 1378aa372e3fSPaul Mullowney 1379aa372e3fSPaul Mullowney /* First, solve U */ 1380aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1381afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 13821b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1383afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1384afb2bd1cSJunchao Zhang #endif 1385afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1386aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1387aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1388aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1389aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1390afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 13911b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1392afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1393afb2bd1cSJunchao Zhang #endif 1394afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1395aa372e3fSPaul Mullowney 1396aa372e3fSPaul Mullowney /* Then, solve L */ 1397aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1398afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 13991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1400afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1401afb2bd1cSJunchao Zhang #endif 1402afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1403aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1404aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1405aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1406aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1407afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1409afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1410afb2bd1cSJunchao Zhang #endif 1411afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1412aa372e3fSPaul Mullowney 1413aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1414c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1415c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1416aa372e3fSPaul Mullowney tempGPU->begin()); 1417aa372e3fSPaul Mullowney 1418aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1419c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1420bda325fcSPaul Mullowney 1421bda325fcSPaul Mullowney /* restore */ 1422c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1423c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 142405035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1425661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1426958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1427bda325fcSPaul Mullowney PetscFunctionReturn(0); 1428bda325fcSPaul Mullowney } 1429bda325fcSPaul Mullowney 14306fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1431bda325fcSPaul Mullowney { 1432465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1433465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1434bda325fcSPaul Mullowney cusparseStatus_t stat; 1435bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1436aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1437aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1438aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1439b175d8bbSPaul Mullowney PetscErrorCode ierr; 144057d48284SJunchao Zhang cudaError_t cerr; 1441bda325fcSPaul Mullowney 1442bda325fcSPaul Mullowney PetscFunctionBegin; 1443aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1444aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1445bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1446aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1447aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1448bda325fcSPaul Mullowney } 1449bda325fcSPaul Mullowney 1450bda325fcSPaul Mullowney /* Get the GPU pointers */ 1451c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1452c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1453bda325fcSPaul Mullowney 14547a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1455aa372e3fSPaul Mullowney /* First, solve U */ 1456aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1457afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14581b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1459afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1460afb2bd1cSJunchao Zhang #endif 1461afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1462aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1463aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1464aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1465aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1466afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 14671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1468afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1469afb2bd1cSJunchao Zhang #endif 1470afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1471aa372e3fSPaul Mullowney 1472aa372e3fSPaul Mullowney /* Then, solve L */ 1473aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1474afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1476afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1477afb2bd1cSJunchao Zhang #endif 1478afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1479aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1480aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1481aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1482aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1483afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1485afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1486afb2bd1cSJunchao Zhang #endif 1487afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1488bda325fcSPaul Mullowney 1489bda325fcSPaul Mullowney /* restore */ 1490c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1491c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 149205035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1493661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1494958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1495bda325fcSPaul Mullowney PetscFunctionReturn(0); 1496bda325fcSPaul Mullowney } 1497bda325fcSPaul Mullowney 14986fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 14999ae82921SPaul Mullowney { 1500465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1501465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1502465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1503465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15049ae82921SPaul Mullowney cusparseStatus_t stat; 15059ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1506aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1507aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1508aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1509b175d8bbSPaul Mullowney PetscErrorCode ierr; 151057d48284SJunchao Zhang cudaError_t cerr; 15119ae82921SPaul Mullowney 15129ae82921SPaul Mullowney PetscFunctionBegin; 1513ebc8f436SDominic Meiser 1514e057df02SPaul Mullowney /* Get the GPU pointers */ 1515c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1516c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1517c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1518c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15199ae82921SPaul Mullowney 15207a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1521aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1522c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1523c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15244e4bbfaaSStefano Zampini tempGPU->begin()); 1525aa372e3fSPaul Mullowney 1526aa372e3fSPaul Mullowney /* Next, solve L */ 1527aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1528afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15291b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1530afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1531afb2bd1cSJunchao Zhang #endif 1532afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1533aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1534aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1535aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1536aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1537afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 15381b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1539afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1540afb2bd1cSJunchao Zhang #endif 1541afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1542aa372e3fSPaul Mullowney 1543aa372e3fSPaul Mullowney /* Then, solve U */ 1544aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1545afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1547afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1548afb2bd1cSJunchao Zhang #endif 1549afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1550aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1551aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1552aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1553aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1554afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 15551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1556afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1557afb2bd1cSJunchao Zhang #endif 1558afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1559aa372e3fSPaul Mullowney 15604e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 15614e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 15624e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 15634e4bbfaaSStefano Zampini xGPU); 15649ae82921SPaul Mullowney 1565c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1566c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 156705035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1568661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 15709ae82921SPaul Mullowney PetscFunctionReturn(0); 15719ae82921SPaul Mullowney } 15729ae82921SPaul Mullowney 15736fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 15749ae82921SPaul Mullowney { 1575465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1576465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 15779ae82921SPaul Mullowney cusparseStatus_t stat; 15789ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1579aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1580aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1581aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1582b175d8bbSPaul Mullowney PetscErrorCode ierr; 158357d48284SJunchao Zhang cudaError_t cerr; 15849ae82921SPaul Mullowney 15859ae82921SPaul Mullowney PetscFunctionBegin; 1586e057df02SPaul Mullowney /* Get the GPU pointers */ 1587c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1588c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 15899ae82921SPaul Mullowney 15907a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1591aa372e3fSPaul Mullowney /* First, solve L */ 1592aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1593afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1595afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1596afb2bd1cSJunchao Zhang #endif 1597afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1598aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1599aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1600aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1601aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1602afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 16031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1605afb2bd1cSJunchao Zhang #endif 1606afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1607aa372e3fSPaul Mullowney 1608aa372e3fSPaul Mullowney /* Next, solve U */ 1609aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1610afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1612afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1613afb2bd1cSJunchao Zhang #endif 1614afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1615aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1616aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1617aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1618aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1619afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 16201b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1621afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1622afb2bd1cSJunchao Zhang #endif 1623afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 16249ae82921SPaul Mullowney 1625c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1626c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 162705035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1628661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1629958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16309ae82921SPaul Mullowney PetscFunctionReturn(0); 16319ae82921SPaul Mullowney } 16329ae82921SPaul Mullowney 16337e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 16347e8381f9SStefano Zampini { 16357e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16367e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 16377e8381f9SStefano Zampini cudaError_t cerr; 16387e8381f9SStefano Zampini PetscErrorCode ierr; 16397e8381f9SStefano Zampini 16407e8381f9SStefano Zampini PetscFunctionBegin; 16417e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 16427e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 16437e8381f9SStefano Zampini 16447e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16457e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 16467e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 16477e8381f9SStefano Zampini ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 16487e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16497e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 16507e8381f9SStefano Zampini } 16517e8381f9SStefano Zampini PetscFunctionReturn(0); 16527e8381f9SStefano Zampini } 16537e8381f9SStefano Zampini 16547e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 16557e8381f9SStefano Zampini { 16567e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16577e8381f9SStefano Zampini PetscErrorCode ierr; 16587e8381f9SStefano Zampini 16597e8381f9SStefano Zampini PetscFunctionBegin; 16607e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 16617e8381f9SStefano Zampini *array = a->a; 16627e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 16637e8381f9SStefano Zampini PetscFunctionReturn(0); 16647e8381f9SStefano Zampini } 16657e8381f9SStefano Zampini 16666fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 16679ae82921SPaul Mullowney { 1668aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 16697c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 16709ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1671213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 16729ae82921SPaul Mullowney PetscErrorCode ierr; 1673aa372e3fSPaul Mullowney cusparseStatus_t stat; 1674b06137fdSPaul Mullowney cudaError_t err; 16759ae82921SPaul Mullowney 16769ae82921SPaul Mullowney PetscFunctionBegin; 167795639643SRichard Tran Mills if (A->boundtocpu) PetscFunctionReturn(0); 1678c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 167981902715SJunchao Zhang if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 168081902715SJunchao Zhang /* Copy values only */ 1681afb2bd1cSJunchao Zhang CsrMatrix *matrix,*matrixT; 1682afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 168385ba7357SStefano Zampini 168485ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1685afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 168605035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 16874863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 168885ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 168981902715SJunchao Zhang 169081902715SJunchao Zhang /* Update matT when it was built before */ 169181902715SJunchao Zhang if (cusparsestruct->matTranspose) { 169281902715SJunchao Zhang cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1693afb2bd1cSJunchao Zhang matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 169485ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 169581902715SJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1696afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1697afb2bd1cSJunchao Zhang matrix->values->data().get(), 169881902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1699afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1700afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1701afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1702afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1703afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1704afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1705afb2bd1cSJunchao Zhang #else 1706afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1707afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1708afb2bd1cSJunchao Zhang #endif 1709afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 171005035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 171185ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 171281902715SJunchao Zhang } 171334d6c7a5SJose E. Roman } else { 171485ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17157c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 17167c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 17177c700b8dSJunchao Zhang delete cusparsestruct->workVector; 171881902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 17199ae82921SPaul Mullowney try { 17209ae82921SPaul Mullowney if (a->compressedrow.use) { 17219ae82921SPaul Mullowney m = a->compressedrow.nrows; 17229ae82921SPaul Mullowney ii = a->compressedrow.i; 17239ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 17249ae82921SPaul Mullowney } else { 1725213423ffSJunchao Zhang m = A->rmap->n; 1726213423ffSJunchao Zhang ii = a->i; 1727e6e9a74fSStefano Zampini ridx = NULL; 17289ae82921SPaul Mullowney } 1729213423ffSJunchao Zhang cusparsestruct->nrows = m; 17309ae82921SPaul Mullowney 173185ba7357SStefano Zampini /* create cusparse matrix */ 1732aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 173357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 173457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 173557d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 17369ae82921SPaul Mullowney 1737afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 17387656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 17397656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1740afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17417656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17427656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 174357d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1744b06137fdSPaul Mullowney 1745aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1746aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1747aa372e3fSPaul Mullowney /* set the matrix */ 1748afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1749afb2bd1cSJunchao Zhang mat->num_rows = m; 1750afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1751afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1752afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1753afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 17549ae82921SPaul Mullowney 1755afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1756afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1757aa372e3fSPaul Mullowney 1758afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1759afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1760aa372e3fSPaul Mullowney 1761aa372e3fSPaul Mullowney /* assign the pointer */ 1762afb2bd1cSJunchao Zhang matstruct->mat = mat; 1763afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1764afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1765afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1766afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1767afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1768afb2bd1cSJunchao Zhang mat->values->data().get(), 1769afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1770afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1771afb2bd1cSJunchao Zhang } 1772afb2bd1cSJunchao Zhang #endif 1773aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1774afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1775afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1776afb2bd1cSJunchao Zhang #else 1777afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1778afb2bd1cSJunchao Zhang mat->num_rows = m; 1779afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1780afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1781afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1782afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1783aa372e3fSPaul Mullowney 1784afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1785afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1786aa372e3fSPaul Mullowney 1787afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1788afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1789aa372e3fSPaul Mullowney 1790aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 179157d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1792aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1793aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1794afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1795afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1796afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1797afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 179857d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1799aa372e3fSPaul Mullowney /* assign the pointer */ 1800aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1801aa372e3fSPaul Mullowney 1802afb2bd1cSJunchao Zhang if (mat) { 1803afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1804afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1805afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1806afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1807087f3262SPaul Mullowney } 1808afb2bd1cSJunchao Zhang #endif 1809087f3262SPaul Mullowney } 1810ca45077fSPaul Mullowney 1811aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1812213423ffSJunchao Zhang if (a->compressedrow.use) { 1813213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1814aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1815aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1816213423ffSJunchao Zhang tmp = m; 1817213423ffSJunchao Zhang } else { 1818213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1819213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1820213423ffSJunchao Zhang tmp = 0; 1821213423ffSJunchao Zhang } 1822213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1823aa372e3fSPaul Mullowney 1824aa372e3fSPaul Mullowney /* assign the pointer */ 1825aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 18269ae82921SPaul Mullowney } catch(char *ex) { 18279ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18289ae82921SPaul Mullowney } 182905035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 183085ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 183134d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 183234d6c7a5SJose E. Roman } 1833c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 18349ae82921SPaul Mullowney } 18359ae82921SPaul Mullowney PetscFunctionReturn(0); 18369ae82921SPaul Mullowney } 18379ae82921SPaul Mullowney 1838c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1839aa372e3fSPaul Mullowney { 1840aa372e3fSPaul Mullowney template <typename Tuple> 1841aa372e3fSPaul Mullowney __host__ __device__ 1842aa372e3fSPaul Mullowney void operator()(Tuple t) 1843aa372e3fSPaul Mullowney { 1844aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1845aa372e3fSPaul Mullowney } 1846aa372e3fSPaul Mullowney }; 1847aa372e3fSPaul Mullowney 18487e8381f9SStefano Zampini struct VecCUDAEquals 18497e8381f9SStefano Zampini { 18507e8381f9SStefano Zampini template <typename Tuple> 18517e8381f9SStefano Zampini __host__ __device__ 18527e8381f9SStefano Zampini void operator()(Tuple t) 18537e8381f9SStefano Zampini { 18547e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 18557e8381f9SStefano Zampini } 18567e8381f9SStefano Zampini }; 18577e8381f9SStefano Zampini 1858e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1859e6e9a74fSStefano Zampini { 1860e6e9a74fSStefano Zampini template <typename Tuple> 1861e6e9a74fSStefano Zampini __host__ __device__ 1862e6e9a74fSStefano Zampini void operator()(Tuple t) 1863e6e9a74fSStefano Zampini { 1864e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1865e6e9a74fSStefano Zampini } 1866e6e9a74fSStefano Zampini }; 1867e6e9a74fSStefano Zampini 1868afb2bd1cSJunchao Zhang struct MatMatCusparse { 1869ccdfe979SStefano Zampini PetscBool cisdense; 1870ccdfe979SStefano Zampini PetscScalar *Bt; 1871ccdfe979SStefano Zampini Mat X; 1872afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1873afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1874afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1875afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1876afb2bd1cSJunchao Zhang size_t spmmBufferSize; 1877afb2bd1cSJunchao Zhang void *spmmBuffer; 1878afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1879afb2bd1cSJunchao Zhang #endif 1880afb2bd1cSJunchao Zhang }; 1881ccdfe979SStefano Zampini 1882ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1883ccdfe979SStefano Zampini { 1884ccdfe979SStefano Zampini PetscErrorCode ierr; 1885ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1886ccdfe979SStefano Zampini cudaError_t cerr; 1887ccdfe979SStefano Zampini 1888ccdfe979SStefano Zampini PetscFunctionBegin; 1889ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1890afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1891afb2bd1cSJunchao Zhang cusparseStatus_t stat; 1892afb2bd1cSJunchao Zhang if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1893afb2bd1cSJunchao Zhang if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1894afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1895afb2bd1cSJunchao Zhang #endif 1896ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1897ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1898ccdfe979SStefano Zampini PetscFunctionReturn(0); 1899ccdfe979SStefano Zampini } 1900ccdfe979SStefano Zampini 1901ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1902ccdfe979SStefano Zampini 1903ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1904ccdfe979SStefano Zampini { 1905ccdfe979SStefano Zampini Mat_Product *product = C->product; 1906ccdfe979SStefano Zampini Mat A,B; 1907afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1908ccdfe979SStefano Zampini PetscBool flg,biscuda; 1909ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1910ccdfe979SStefano Zampini cusparseStatus_t stat; 1911ccdfe979SStefano Zampini cusparseOperation_t opA; 1912ccdfe979SStefano Zampini const PetscScalar *barray; 1913ccdfe979SStefano Zampini PetscScalar *carray; 1914ccdfe979SStefano Zampini PetscErrorCode ierr; 1915ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1916ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1917ccdfe979SStefano Zampini CsrMatrix *csrmat; 1918afb2bd1cSJunchao Zhang cudaError_t cerr; 1919ccdfe979SStefano Zampini 1920ccdfe979SStefano Zampini PetscFunctionBegin; 1921ccdfe979SStefano Zampini MatCheckProduct(C,1); 1922ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1923ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1924ccdfe979SStefano Zampini A = product->A; 1925ccdfe979SStefano Zampini B = product->B; 1926ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1927ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1928ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1929ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1930ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1931ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1932ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1933ccdfe979SStefano Zampini switch (product->type) { 1934ccdfe979SStefano Zampini case MATPRODUCT_AB: 1935ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1936ccdfe979SStefano Zampini mat = cusp->mat; 1937ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1938ccdfe979SStefano Zampini m = A->rmap->n; 1939ccdfe979SStefano Zampini n = B->cmap->n; 1940ccdfe979SStefano Zampini break; 1941ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1942e6e9a74fSStefano Zampini if (!cusp->transgen) { 1943e6e9a74fSStefano Zampini mat = cusp->mat; 1944e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1945e6e9a74fSStefano Zampini } else { 1946ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1947ccdfe979SStefano Zampini mat = cusp->matTranspose; 1948ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1949e6e9a74fSStefano Zampini } 1950ccdfe979SStefano Zampini m = A->cmap->n; 1951ccdfe979SStefano Zampini n = B->cmap->n; 1952ccdfe979SStefano Zampini break; 1953ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1954ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1955ccdfe979SStefano Zampini mat = cusp->mat; 1956ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1957ccdfe979SStefano Zampini m = A->rmap->n; 1958ccdfe979SStefano Zampini n = B->rmap->n; 1959ccdfe979SStefano Zampini break; 1960ccdfe979SStefano Zampini default: 1961ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1962ccdfe979SStefano Zampini } 1963ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1964ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 1965ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 1966ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1967afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1968ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1969afb2bd1cSJunchao Zhang 1970ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1971c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1972c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1973c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1974c8378d12SStefano Zampini } else { 1975c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1976c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1977c8378d12SStefano Zampini } 1978c8378d12SStefano Zampini 1979c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1980afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1981afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1982afb2bd1cSJunchao Zhang /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1983afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1984afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1985afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 1986afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1987afb2bd1cSJunchao Zhang mmdata->Blda = blda; 1988afb2bd1cSJunchao Zhang } 1989c8378d12SStefano Zampini 1990afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1991afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1992afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1993afb2bd1cSJunchao Zhang mmdata->Clda = clda; 1994afb2bd1cSJunchao Zhang } 1995afb2bd1cSJunchao Zhang 1996afb2bd1cSJunchao Zhang if (!mat->matDescr) { 1997afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 1998afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1999afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2000afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2001afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2002afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2003afb2bd1cSJunchao Zhang } 2004afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2005afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2006afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2007afb2bd1cSJunchao Zhang cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 2008afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 2009afb2bd1cSJunchao Zhang cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 2010afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2011afb2bd1cSJunchao Zhang } else { 2012afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2013afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2014afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2015afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2016afb2bd1cSJunchao Zhang } 2017afb2bd1cSJunchao Zhang 2018afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2019afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2020afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2021afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2022afb2bd1cSJunchao Zhang cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 2023afb2bd1cSJunchao Zhang #else 2024afb2bd1cSJunchao Zhang PetscInt k; 2025afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2026ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2027ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2028ccdfe979SStefano Zampini cublasStatus_t cerr; 2029ccdfe979SStefano Zampini 2030ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2031ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2032ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2033ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2034ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2035ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2036ccdfe979SStefano Zampini blda = B->cmap->n; 2037afb2bd1cSJunchao Zhang k = B->cmap->n; 2038afb2bd1cSJunchao Zhang } else { 2039afb2bd1cSJunchao Zhang k = B->rmap->n; 2040ccdfe979SStefano Zampini } 2041ccdfe979SStefano Zampini 2042afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2043ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2044afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2045ccdfe979SStefano Zampini csrmat->values->data().get(), 2046ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2047ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2048ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2049ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2050afb2bd1cSJunchao Zhang #endif 2051afb2bd1cSJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2052c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2053c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2054ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2055ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2056ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2057ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2058ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2059ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2060ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2061ccdfe979SStefano Zampini } else { 2062ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2063ccdfe979SStefano Zampini } 2064ccdfe979SStefano Zampini if (mmdata->cisdense) { 2065ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2066ccdfe979SStefano Zampini } 2067ccdfe979SStefano Zampini if (!biscuda) { 2068ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2069ccdfe979SStefano Zampini } 2070ccdfe979SStefano Zampini PetscFunctionReturn(0); 2071ccdfe979SStefano Zampini } 2072ccdfe979SStefano Zampini 2073ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2074ccdfe979SStefano Zampini { 2075ccdfe979SStefano Zampini Mat_Product *product = C->product; 2076ccdfe979SStefano Zampini Mat A,B; 2077ccdfe979SStefano Zampini PetscInt m,n; 2078ccdfe979SStefano Zampini PetscBool cisdense,flg; 2079ccdfe979SStefano Zampini PetscErrorCode ierr; 2080ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2081ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2082ccdfe979SStefano Zampini 2083ccdfe979SStefano Zampini PetscFunctionBegin; 2084ccdfe979SStefano Zampini MatCheckProduct(C,1); 2085ccdfe979SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2086ccdfe979SStefano Zampini A = product->A; 2087ccdfe979SStefano Zampini B = product->B; 2088ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2089ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2090ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2091ccdfe979SStefano Zampini if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2092ccdfe979SStefano Zampini switch (product->type) { 2093ccdfe979SStefano Zampini case MATPRODUCT_AB: 2094ccdfe979SStefano Zampini m = A->rmap->n; 2095ccdfe979SStefano Zampini n = B->cmap->n; 2096ccdfe979SStefano Zampini break; 2097ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2098ccdfe979SStefano Zampini m = A->cmap->n; 2099ccdfe979SStefano Zampini n = B->cmap->n; 2100ccdfe979SStefano Zampini break; 2101ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2102ccdfe979SStefano Zampini m = A->rmap->n; 2103ccdfe979SStefano Zampini n = B->rmap->n; 2104ccdfe979SStefano Zampini break; 2105ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2106ccdfe979SStefano Zampini m = B->cmap->n; 2107ccdfe979SStefano Zampini n = B->cmap->n; 2108ccdfe979SStefano Zampini break; 2109ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2110ccdfe979SStefano Zampini m = B->rmap->n; 2111ccdfe979SStefano Zampini n = B->rmap->n; 2112ccdfe979SStefano Zampini break; 2113ccdfe979SStefano Zampini default: 2114ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2115ccdfe979SStefano Zampini } 2116ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2117ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2118ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2119ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2120ccdfe979SStefano Zampini 2121ccdfe979SStefano Zampini /* product data */ 2122ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2123ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2124afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2125afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2126ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2127afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2128ccdfe979SStefano Zampini } 2129afb2bd1cSJunchao Zhang #endif 2130ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2131ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2132ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2133ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2134ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2135ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2136ccdfe979SStefano Zampini } else { 2137ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2138ccdfe979SStefano Zampini } 2139ccdfe979SStefano Zampini } 2140ccdfe979SStefano Zampini C->product->data = mmdata; 2141ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2142ccdfe979SStefano Zampini 2143ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2144ccdfe979SStefano Zampini PetscFunctionReturn(0); 2145ccdfe979SStefano Zampini } 2146ccdfe979SStefano Zampini 2147ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2148ccdfe979SStefano Zampini 2149ccdfe979SStefano Zampini /* handles dense B */ 2150ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2151ccdfe979SStefano Zampini { 2152ccdfe979SStefano Zampini Mat_Product *product = C->product; 2153ccdfe979SStefano Zampini PetscErrorCode ierr; 2154ccdfe979SStefano Zampini 2155ccdfe979SStefano Zampini PetscFunctionBegin; 2156ccdfe979SStefano Zampini MatCheckProduct(C,1); 2157ccdfe979SStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2158ccdfe979SStefano Zampini if (product->A->boundtocpu) { 2159ccdfe979SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2160ccdfe979SStefano Zampini PetscFunctionReturn(0); 2161ccdfe979SStefano Zampini } 2162ccdfe979SStefano Zampini switch (product->type) { 2163ccdfe979SStefano Zampini case MATPRODUCT_AB: 2164ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2165ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2166ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2167ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2168ccdfe979SStefano Zampini C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2169ccdfe979SStefano Zampini default: 2170ccdfe979SStefano Zampini break; 2171ccdfe979SStefano Zampini } 2172ccdfe979SStefano Zampini PetscFunctionReturn(0); 2173ccdfe979SStefano Zampini } 2174ccdfe979SStefano Zampini 21756fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 21769ae82921SPaul Mullowney { 2177b175d8bbSPaul Mullowney PetscErrorCode ierr; 21789ae82921SPaul Mullowney 21799ae82921SPaul Mullowney PetscFunctionBegin; 2180e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2181e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2182e6e9a74fSStefano Zampini } 2183e6e9a74fSStefano Zampini 2184e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2185e6e9a74fSStefano Zampini { 2186e6e9a74fSStefano Zampini PetscErrorCode ierr; 2187e6e9a74fSStefano Zampini 2188e6e9a74fSStefano Zampini PetscFunctionBegin; 2189e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2190e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2191e6e9a74fSStefano Zampini } 2192e6e9a74fSStefano Zampini 2193e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2194e6e9a74fSStefano Zampini { 2195e6e9a74fSStefano Zampini PetscErrorCode ierr; 2196e6e9a74fSStefano Zampini 2197e6e9a74fSStefano Zampini PetscFunctionBegin; 2198e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2199e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2200e6e9a74fSStefano Zampini } 2201e6e9a74fSStefano Zampini 2202e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2203e6e9a74fSStefano Zampini { 2204e6e9a74fSStefano Zampini PetscErrorCode ierr; 2205e6e9a74fSStefano Zampini 2206e6e9a74fSStefano Zampini PetscFunctionBegin; 2207e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 22089ae82921SPaul Mullowney PetscFunctionReturn(0); 22099ae82921SPaul Mullowney } 22109ae82921SPaul Mullowney 22116fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2212ca45077fSPaul Mullowney { 2213b175d8bbSPaul Mullowney PetscErrorCode ierr; 2214ca45077fSPaul Mullowney 2215ca45077fSPaul Mullowney PetscFunctionBegin; 2216e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2217ca45077fSPaul Mullowney PetscFunctionReturn(0); 2218ca45077fSPaul Mullowney } 2219ca45077fSPaul Mullowney 2220afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2221e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 22229ae82921SPaul Mullowney { 22239ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2224aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 22259ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2226e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2227b175d8bbSPaul Mullowney PetscErrorCode ierr; 222857d48284SJunchao Zhang cudaError_t cerr; 2229aa372e3fSPaul Mullowney cusparseStatus_t stat; 2230e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2231e6e9a74fSStefano Zampini PetscBool compressed; 2232afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2233afb2bd1cSJunchao Zhang PetscInt nx,ny; 2234afb2bd1cSJunchao Zhang #endif 22356e111a19SKarl Rupp 22369ae82921SPaul Mullowney PetscFunctionBegin; 2237e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2238e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2239afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2240d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2241e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2242e6e9a74fSStefano Zampini } 224334d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 224434d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2245e6e9a74fSStefano Zampini if (!trans) { 22469ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2247c9567895SMark if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2248e6e9a74fSStefano Zampini } else { 2249e6e9a74fSStefano Zampini if (herm || !cusparsestruct->transgen) { 2250e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2251e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2252e6e9a74fSStefano Zampini } else { 2253afb2bd1cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2254e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2255e6e9a74fSStefano Zampini } 2256e6e9a74fSStefano Zampini } 2257e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2258e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2259213423ffSJunchao Zhang 2260e6e9a74fSStefano Zampini try { 2261e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2262213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2263213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2264afb2bd1cSJunchao Zhang 226585ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2266e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2267afb2bd1cSJunchao Zhang /* z = A x + beta y. 2268afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2269afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2270afb2bd1cSJunchao Zhang */ 2271e6e9a74fSStefano Zampini xptr = xarray; 2272afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2273213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2274afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2275afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2276afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2277afb2bd1cSJunchao Zhang */ 2278afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2279afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2280afb2bd1cSJunchao Zhang nx = mat->num_cols; 2281afb2bd1cSJunchao Zhang ny = mat->num_rows; 2282afb2bd1cSJunchao Zhang } 2283afb2bd1cSJunchao Zhang #endif 2284e6e9a74fSStefano Zampini } else { 2285afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2286afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2287afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2288afb2bd1cSJunchao Zhang */ 2289afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2290e6e9a74fSStefano Zampini dptr = zarray; 2291e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2292afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2293e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2294e6e9a74fSStefano Zampini thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2295e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2296e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2297e6e9a74fSStefano Zampini } 2298afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2299afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2300afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2301afb2bd1cSJunchao Zhang nx = mat->num_rows; 2302afb2bd1cSJunchao Zhang ny = mat->num_cols; 2303afb2bd1cSJunchao Zhang } 2304afb2bd1cSJunchao Zhang #endif 2305e6e9a74fSStefano Zampini } 23069ae82921SPaul Mullowney 2307afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2308aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2309afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2310afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2311afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2312afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2313afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2314afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2315afb2bd1cSJunchao Zhang matstruct->matDescr, 2316afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2317afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2318afb2bd1cSJunchao Zhang cusparse_scalartype, 2319afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2320afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2321afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2322afb2bd1cSJunchao Zhang 2323afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2324afb2bd1cSJunchao Zhang } else { 2325afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2326afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2327afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2328afb2bd1cSJunchao Zhang } 2329afb2bd1cSJunchao Zhang 2330afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2331afb2bd1cSJunchao Zhang matstruct->alpha_one, 2332afb2bd1cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2333afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2334afb2bd1cSJunchao Zhang beta, 2335afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2336afb2bd1cSJunchao Zhang cusparse_scalartype, 2337afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2338afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2339afb2bd1cSJunchao Zhang #else 23407656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2341e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2342a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 2343afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2344aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2345e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 234657d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2347afb2bd1cSJunchao Zhang #endif 2348aa372e3fSPaul Mullowney } else { 2349213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2350afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2351afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2352afb2bd1cSJunchao Zhang #else 2353301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2354e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2355afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2356e6e9a74fSStefano Zampini xptr, beta, 235757d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2358afb2bd1cSJunchao Zhang #endif 2359a65300a6SPaul Mullowney } 2360aa372e3fSPaul Mullowney } 236105035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2362958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2363aa372e3fSPaul Mullowney 2364e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2365213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2366213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2367213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2368e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2369213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 23707656d835SStefano Zampini } 2371213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2372c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 23737656d835SStefano Zampini } 23747656d835SStefano Zampini 2375213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2376213423ffSJunchao Zhang if (compressed) { 2377213423ffSJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2378e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2379c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2380e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2381c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 238205035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2383958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2384e6e9a74fSStefano Zampini } 2385e6e9a74fSStefano Zampini } else { 2386e6e9a74fSStefano Zampini if (yy && yy != zz) { 2387e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2388e6e9a74fSStefano Zampini } 2389e6e9a74fSStefano Zampini } 2390e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2391213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2392213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 23939ae82921SPaul Mullowney } catch(char *ex) { 23949ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 23959ae82921SPaul Mullowney } 2396e6e9a74fSStefano Zampini if (yy) { 2397958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2398e6e9a74fSStefano Zampini } else { 2399e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2400e6e9a74fSStefano Zampini } 24019ae82921SPaul Mullowney PetscFunctionReturn(0); 24029ae82921SPaul Mullowney } 24039ae82921SPaul Mullowney 24046fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2405ca45077fSPaul Mullowney { 2406b175d8bbSPaul Mullowney PetscErrorCode ierr; 24076e111a19SKarl Rupp 2408ca45077fSPaul Mullowney PetscFunctionBegin; 2409e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2410ca45077fSPaul Mullowney PetscFunctionReturn(0); 2411ca45077fSPaul Mullowney } 2412ca45077fSPaul Mullowney 24136fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 24149ae82921SPaul Mullowney { 24159ae82921SPaul Mullowney PetscErrorCode ierr; 24163fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 24173fa6b06aSMark Adams PetscBool is_seq = PETSC_TRUE; 24183fa6b06aSMark Adams PetscInt nnz_state = A->nonzerostate; 24199ae82921SPaul Mullowney PetscFunctionBegin; 2420bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 24213fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2422bc3f50f2SPaul Mullowney } 24233fa6b06aSMark Adams if (d_mat) { 24243fa6b06aSMark Adams cudaError_t err; 24253fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 24263fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 24273fa6b06aSMark Adams nnz_state = h_mat.nonzerostate; 24283fa6b06aSMark Adams is_seq = h_mat.seq; 24293fa6b06aSMark Adams } 24303fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 24313fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 24323fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU 24333fa6b06aSMark Adams ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 24343fa6b06aSMark Adams } else if (nnz_state > A->nonzerostate) { 24353fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 24363fa6b06aSMark Adams } 24373fa6b06aSMark Adams 24389ae82921SPaul Mullowney PetscFunctionReturn(0); 24399ae82921SPaul Mullowney } 24409ae82921SPaul Mullowney 24419ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2442e057df02SPaul Mullowney /*@ 24439ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2444e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2445e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2446e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2447e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2448e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 24499ae82921SPaul Mullowney 2450d083f849SBarry Smith Collective 24519ae82921SPaul Mullowney 24529ae82921SPaul Mullowney Input Parameters: 24539ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 24549ae82921SPaul Mullowney . m - number of rows 24559ae82921SPaul Mullowney . n - number of columns 24569ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 24579ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 24580298fd71SBarry Smith (possibly different for each row) or NULL 24599ae82921SPaul Mullowney 24609ae82921SPaul Mullowney Output Parameter: 24619ae82921SPaul Mullowney . A - the matrix 24629ae82921SPaul Mullowney 24639ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 24649ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 24659ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 24669ae82921SPaul Mullowney 24679ae82921SPaul Mullowney Notes: 24689ae82921SPaul Mullowney If nnz is given then nz is ignored 24699ae82921SPaul Mullowney 24709ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 24719ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 24729ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 24739ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 24749ae82921SPaul Mullowney 24759ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 24760298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 24779ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 24789ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 24799ae82921SPaul Mullowney 24809ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 24819ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 24829ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 24839ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 24849ae82921SPaul Mullowney 24859ae82921SPaul Mullowney Level: intermediate 24869ae82921SPaul Mullowney 2487e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 24889ae82921SPaul Mullowney @*/ 24899ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 24909ae82921SPaul Mullowney { 24919ae82921SPaul Mullowney PetscErrorCode ierr; 24929ae82921SPaul Mullowney 24939ae82921SPaul Mullowney PetscFunctionBegin; 24949ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 24959ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 24969ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 24979ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 24989ae82921SPaul Mullowney PetscFunctionReturn(0); 24999ae82921SPaul Mullowney } 25009ae82921SPaul Mullowney 25016fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 25029ae82921SPaul Mullowney { 25039ae82921SPaul Mullowney PetscErrorCode ierr; 25043fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 2505ab25e6cbSDominic Meiser 25069ae82921SPaul Mullowney PetscFunctionBegin; 25079ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 25083fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 25093fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2510470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 25119ae82921SPaul Mullowney } else { 2512470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2513aa372e3fSPaul Mullowney } 25143fa6b06aSMark Adams if (d_mat) { 25153fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 25163fa6b06aSMark Adams cudaError_t err; 25173fa6b06aSMark Adams PetscSplitCSRDataStructure h_mat; 25183fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 25193fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 25203fa6b06aSMark Adams if (h_mat.seq) { 25213fa6b06aSMark Adams if (a->compressedrow.use) { 25223fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 25233fa6b06aSMark Adams } 25243fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 25253fa6b06aSMark Adams } 25263fa6b06aSMark Adams } 2527ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2528ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2529ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2530ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 25317e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 25327e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 25339ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 25349ae82921SPaul Mullowney PetscFunctionReturn(0); 25359ae82921SPaul Mullowney } 25369ae82921SPaul Mullowney 2537ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 253895639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 25399ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 25409ff858a8SKarl Rupp { 25419ff858a8SKarl Rupp PetscErrorCode ierr; 25429ff858a8SKarl Rupp 25439ff858a8SKarl Rupp PetscFunctionBegin; 25449ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2545ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 25469ff858a8SKarl Rupp PetscFunctionReturn(0); 25479ff858a8SKarl Rupp } 25489ff858a8SKarl Rupp 254995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 255095639643SRichard Tran Mills { 2551c58ef05eSStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2552e6e9a74fSStefano Zampini PetscErrorCode ierr; 2553e6e9a74fSStefano Zampini 255495639643SRichard Tran Mills PetscFunctionBegin; 2555e6e9a74fSStefano Zampini if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 255695639643SRichard Tran Mills if (flg) { 25577e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 25587e8381f9SStefano Zampini 255995639643SRichard Tran Mills A->ops->mult = MatMult_SeqAIJ; 256095639643SRichard Tran Mills A->ops->multadd = MatMultAdd_SeqAIJ; 2561c34f1ff0SRichard Tran Mills A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2562c34f1ff0SRichard Tran Mills A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2563e6e9a74fSStefano Zampini A->ops->multhermitiantranspose = NULL; 2564e6e9a74fSStefano Zampini A->ops->multhermitiantransposeadd = NULL; 2565e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2566e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 25677e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 25687e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 25697e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 257095639643SRichard Tran Mills } else { 257195639643SRichard Tran Mills A->ops->mult = MatMult_SeqAIJCUSPARSE; 257295639643SRichard Tran Mills A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 257395639643SRichard Tran Mills A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 257495639643SRichard Tran Mills A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2575e6e9a74fSStefano Zampini A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2576e6e9a74fSStefano Zampini A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2577e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2578e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 25797e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 25807e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 25817e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 258295639643SRichard Tran Mills } 258395639643SRichard Tran Mills A->boundtocpu = flg; 2584c58ef05eSStefano Zampini a->inode.use = flg; 258595639643SRichard Tran Mills PetscFunctionReturn(0); 258695639643SRichard Tran Mills } 258795639643SRichard Tran Mills 25883fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 25893fa6b06aSMark Adams { 25903fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 25913fa6b06aSMark Adams PetscErrorCode ierr; 25927e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 25937e8381f9SStefano Zampini 25943fa6b06aSMark Adams PetscFunctionBegin; 25953fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 25963fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 25977e8381f9SStefano Zampini if (spptr->mat) { 25987e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 25997e8381f9SStefano Zampini if (matrix->values) { 26007e8381f9SStefano Zampini both = PETSC_TRUE; 26017e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 26027e8381f9SStefano Zampini } 26037e8381f9SStefano Zampini } 26047e8381f9SStefano Zampini if (spptr->matTranspose) { 26057e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 26067e8381f9SStefano Zampini if (matrix->values) { 26077e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 26087e8381f9SStefano Zampini } 26097e8381f9SStefano Zampini } 26103fa6b06aSMark Adams d_mat = spptr->deviceMat; 26113fa6b06aSMark Adams } 26123fa6b06aSMark Adams if (d_mat) { 26133fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 26143fa6b06aSMark Adams PetscInt n = A->rmap->n, nnz = a->i[n]; 26153fa6b06aSMark Adams cudaError_t err; 26163fa6b06aSMark Adams PetscScalar *vals; 26173fa6b06aSMark Adams ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr); 26183fa6b06aSMark Adams err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 26193fa6b06aSMark Adams err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err); 26203fa6b06aSMark Adams } 26213fa6b06aSMark Adams ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 26227e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 26233fa6b06aSMark Adams 26243fa6b06aSMark Adams PetscFunctionReturn(0); 26253fa6b06aSMark Adams } 26263fa6b06aSMark Adams 262749735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 26289ae82921SPaul Mullowney { 26299ae82921SPaul Mullowney PetscErrorCode ierr; 2630aa372e3fSPaul Mullowney cusparseStatus_t stat; 263149735bf3SStefano Zampini Mat B; 26329ae82921SPaul Mullowney 26339ae82921SPaul Mullowney PetscFunctionBegin; 263449735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 263549735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 263649735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 263749735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 263849735bf3SStefano Zampini } 263949735bf3SStefano Zampini B = *newmat; 264049735bf3SStefano Zampini 264134136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 264234136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 264334136279SStefano Zampini 264449735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 26459ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 2646e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 2647e6e9a74fSStefano Zampini 2648e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2649e6e9a74fSStefano Zampini spptr->format = MAT_CUSPARSE_CSR; 2650e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2651e6e9a74fSStefano Zampini B->spptr = spptr; 26523fa6b06aSMark Adams spptr->deviceMat = NULL; 26539ae82921SPaul Mullowney } else { 2654e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 2655e6e9a74fSStefano Zampini 2656e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2657e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2658e6e9a74fSStefano Zampini B->spptr = spptr; 26599ae82921SPaul Mullowney } 2660e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 266149735bf3SStefano Zampini } 2662693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 26639ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 26649ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 266595639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2666693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 26673fa6b06aSMark Adams B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 26682205254eSKarl Rupp 2669e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 26709ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2671bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 26729ae82921SPaul Mullowney PetscFunctionReturn(0); 26739ae82921SPaul Mullowney } 26749ae82921SPaul Mullowney 267502fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 267602fe1965SBarry Smith { 267702fe1965SBarry Smith PetscErrorCode ierr; 267802fe1965SBarry Smith 267902fe1965SBarry Smith PetscFunctionBegin; 268005035670SJunchao Zhang ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 268102fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 26820ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2683afb2bd1cSJunchao Zhang ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 2684afb2bd1cSJunchao Zhang ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 2685afb2bd1cSJunchao Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 268602fe1965SBarry Smith PetscFunctionReturn(0); 268702fe1965SBarry Smith } 268802fe1965SBarry Smith 26893ca39a21SBarry Smith /*MC 2690e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2691e057df02SPaul Mullowney 2692e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 26932692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 26942692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2695e057df02SPaul Mullowney 2696e057df02SPaul Mullowney Options Database Keys: 2697e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2698aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2699a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2700e057df02SPaul Mullowney 2701e057df02SPaul Mullowney Level: beginner 2702e057df02SPaul Mullowney 27038468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2704e057df02SPaul Mullowney M*/ 27057f756511SDominic Meiser 270642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 270742c9c57cSBarry Smith 27080f39cd5aSBarry Smith 27093ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 271042c9c57cSBarry Smith { 271142c9c57cSBarry Smith PetscErrorCode ierr; 271242c9c57cSBarry Smith 271342c9c57cSBarry Smith PetscFunctionBegin; 27143ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 27153ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 27163ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 27173ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 271842c9c57cSBarry Smith PetscFunctionReturn(0); 271942c9c57cSBarry Smith } 272029b38603SBarry Smith 2721470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 27227f756511SDominic Meiser { 2723e6e9a74fSStefano Zampini PetscErrorCode ierr; 27247f756511SDominic Meiser cusparseStatus_t stat; 27257f756511SDominic Meiser 27267f756511SDominic Meiser PetscFunctionBegin; 27277f756511SDominic Meiser if (*cusparsestruct) { 2728e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2729e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 27307f756511SDominic Meiser delete (*cusparsestruct)->workVector; 273181902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 27327e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 27337e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 27347e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_v; 27357e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_w; 27367e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 2737afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2738afb2bd1cSJunchao Zhang cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2739afb2bd1cSJunchao Zhang #endif 2740e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 27417f756511SDominic Meiser } 27427f756511SDominic Meiser PetscFunctionReturn(0); 27437f756511SDominic Meiser } 27447f756511SDominic Meiser 27457f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 27467f756511SDominic Meiser { 27477f756511SDominic Meiser PetscFunctionBegin; 27487f756511SDominic Meiser if (*mat) { 27497f756511SDominic Meiser delete (*mat)->values; 27507f756511SDominic Meiser delete (*mat)->column_indices; 27517f756511SDominic Meiser delete (*mat)->row_offsets; 27527f756511SDominic Meiser delete *mat; 27537f756511SDominic Meiser *mat = 0; 27547f756511SDominic Meiser } 27557f756511SDominic Meiser PetscFunctionReturn(0); 27567f756511SDominic Meiser } 27577f756511SDominic Meiser 2758470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 27597f756511SDominic Meiser { 27607f756511SDominic Meiser cusparseStatus_t stat; 27617f756511SDominic Meiser PetscErrorCode ierr; 27627f756511SDominic Meiser 27637f756511SDominic Meiser PetscFunctionBegin; 27647f756511SDominic Meiser if (*trifactor) { 276557d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2766afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 27677f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 27681b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2769afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 27701b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2771afb2bd1cSJunchao Zhang #endif 2772*da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 27737f756511SDominic Meiser } 27747f756511SDominic Meiser PetscFunctionReturn(0); 27757f756511SDominic Meiser } 27767f756511SDominic Meiser 2777470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 27787f756511SDominic Meiser { 27797f756511SDominic Meiser CsrMatrix *mat; 27807f756511SDominic Meiser cusparseStatus_t stat; 27817f756511SDominic Meiser cudaError_t err; 27827f756511SDominic Meiser 27837f756511SDominic Meiser PetscFunctionBegin; 27847f756511SDominic Meiser if (*matstruct) { 27857f756511SDominic Meiser if ((*matstruct)->mat) { 27867f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2787afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2788afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2789afb2bd1cSJunchao Zhang #else 27907f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 279157d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2792afb2bd1cSJunchao Zhang #endif 27937f756511SDominic Meiser } else { 27947f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 27957f756511SDominic Meiser CsrMatrix_Destroy(&mat); 27967f756511SDominic Meiser } 27977f756511SDominic Meiser } 279857d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 27997f756511SDominic Meiser delete (*matstruct)->cprowIndices; 2800afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 28017656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 28027656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2803afb2bd1cSJunchao Zhang 2804afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2805afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2806afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2807afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 2808afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 2809afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2810afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2811afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2812afb2bd1cSJunchao Zhang } 2813afb2bd1cSJunchao Zhang } 2814afb2bd1cSJunchao Zhang #endif 28157f756511SDominic Meiser delete *matstruct; 28167e8381f9SStefano Zampini *matstruct = NULL; 28177f756511SDominic Meiser } 28187f756511SDominic Meiser PetscFunctionReturn(0); 28197f756511SDominic Meiser } 28207f756511SDominic Meiser 2821ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 28227f756511SDominic Meiser { 2823e6e9a74fSStefano Zampini PetscErrorCode ierr; 2824e6e9a74fSStefano Zampini 28257f756511SDominic Meiser PetscFunctionBegin; 28267f756511SDominic Meiser if (*trifactors) { 2827e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2828e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2829e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2830e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 28317f756511SDominic Meiser delete (*trifactors)->rpermIndices; 28327f756511SDominic Meiser delete (*trifactors)->cpermIndices; 28337f756511SDominic Meiser delete (*trifactors)->workVector; 28347e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 28357e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 28367e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 2837ccdfe979SStefano Zampini } 2838ccdfe979SStefano Zampini PetscFunctionReturn(0); 2839ccdfe979SStefano Zampini } 2840ccdfe979SStefano Zampini 2841ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2842ccdfe979SStefano Zampini { 2843e6e9a74fSStefano Zampini PetscErrorCode ierr; 2844ccdfe979SStefano Zampini cusparseHandle_t handle; 2845ccdfe979SStefano Zampini cusparseStatus_t stat; 2846ccdfe979SStefano Zampini 2847ccdfe979SStefano Zampini PetscFunctionBegin; 2848ccdfe979SStefano Zampini if (*trifactors) { 2849e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 28507f756511SDominic Meiser if (handle = (*trifactors)->handle) { 285157d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 28527f756511SDominic Meiser } 2853e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 28547f756511SDominic Meiser } 28557f756511SDominic Meiser PetscFunctionReturn(0); 28567f756511SDominic Meiser } 28577e8381f9SStefano Zampini 28587e8381f9SStefano Zampini struct IJCompare 28597e8381f9SStefano Zampini { 28607e8381f9SStefano Zampini __host__ __device__ 28617e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 28627e8381f9SStefano Zampini { 28637e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 28647e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 28657e8381f9SStefano Zampini return false; 28667e8381f9SStefano Zampini } 28677e8381f9SStefano Zampini }; 28687e8381f9SStefano Zampini 28697e8381f9SStefano Zampini struct IJEqual 28707e8381f9SStefano Zampini { 28717e8381f9SStefano Zampini __host__ __device__ 28727e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 28737e8381f9SStefano Zampini { 28747e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 28757e8381f9SStefano Zampini return true; 28767e8381f9SStefano Zampini } 28777e8381f9SStefano Zampini }; 28787e8381f9SStefano Zampini 28797e8381f9SStefano Zampini struct IJDiff 28807e8381f9SStefano Zampini { 28817e8381f9SStefano Zampini __host__ __device__ 28827e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 28837e8381f9SStefano Zampini { 28847e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 28857e8381f9SStefano Zampini } 28867e8381f9SStefano Zampini }; 28877e8381f9SStefano Zampini 28887e8381f9SStefano Zampini struct IJSum 28897e8381f9SStefano Zampini { 28907e8381f9SStefano Zampini __host__ __device__ 28917e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 28927e8381f9SStefano Zampini { 28937e8381f9SStefano Zampini return t1||t2; 28947e8381f9SStefano Zampini } 28957e8381f9SStefano Zampini }; 28967e8381f9SStefano Zampini 28977e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 28987e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 28997e8381f9SStefano Zampini { 29007e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 29017e8381f9SStefano Zampini CsrMatrix *matrix; 29027e8381f9SStefano Zampini PetscErrorCode ierr; 29037e8381f9SStefano Zampini cudaError_t cerr; 29047e8381f9SStefano Zampini PetscInt n; 29057e8381f9SStefano Zampini 29067e8381f9SStefano Zampini PetscFunctionBegin; 29077e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 29087e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 29097e8381f9SStefano Zampini if (!cusp->cooPerm) { 29107e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 29117e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 29127e8381f9SStefano Zampini PetscFunctionReturn(0); 29137e8381f9SStefano Zampini } 29147e8381f9SStefano Zampini n = cusp->cooPerm->size(); 29157e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 29167e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 29177e8381f9SStefano Zampini if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 29187e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 29197e8381f9SStefano Zampini if (v) { 29207e8381f9SStefano Zampini cusp->cooPerm_v->assign(v,v+n); 29217e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 29227e8381f9SStefano Zampini } 29237e8381f9SStefano Zampini else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 29247e8381f9SStefano Zampini if (imode == ADD_VALUES) { 29257e8381f9SStefano Zampini if (cusp->cooPerm_a) { 29267e8381f9SStefano Zampini if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 29277e8381f9SStefano Zampini auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 29287e8381f9SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 29297e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 29307e8381f9SStefano Zampini } else { 29317e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 29327e8381f9SStefano Zampini matrix->values->begin())); 29337e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 29347e8381f9SStefano Zampini matrix->values->end())); 29357e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 29367e8381f9SStefano Zampini } 29377e8381f9SStefano Zampini } else { 29387e8381f9SStefano Zampini if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence) 29397e8381f9SStefano Zampini if we are inserting two different values into the same location */ 29407e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 29417e8381f9SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin()))); 29427e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 29437e8381f9SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end()))); 29447e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 29457e8381f9SStefano Zampini } else { 29467e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 29477e8381f9SStefano Zampini matrix->values->begin())); 29487e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 29497e8381f9SStefano Zampini matrix->values->end())); 29507e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 29517e8381f9SStefano Zampini } 29527e8381f9SStefano Zampini } 29537e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 29547e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 29557e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 29567e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 29577e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 29587e8381f9SStefano Zampini /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 29597e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 29607e8381f9SStefano Zampini PetscFunctionReturn(0); 29617e8381f9SStefano Zampini } 29627e8381f9SStefano Zampini 29637e8381f9SStefano Zampini #include <thrust/binary_search.h> 29647e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 29657e8381f9SStefano Zampini { 29667e8381f9SStefano Zampini PetscErrorCode ierr; 29677e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 29687e8381f9SStefano Zampini CsrMatrix *matrix; 29697e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 29707e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 29717e8381f9SStefano Zampini cudaError_t cerr; 29727e8381f9SStefano Zampini 29737e8381f9SStefano Zampini PetscFunctionBegin; 29747e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 29757e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 29767e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 29777e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 29787e8381f9SStefano Zampini if (n != cooPerm_n) { 29797e8381f9SStefano Zampini delete cusp->cooPerm; 29807e8381f9SStefano Zampini delete cusp->cooPerm_v; 29817e8381f9SStefano Zampini delete cusp->cooPerm_w; 29827e8381f9SStefano Zampini delete cusp->cooPerm_a; 29837e8381f9SStefano Zampini cusp->cooPerm = NULL; 29847e8381f9SStefano Zampini cusp->cooPerm_v = NULL; 29857e8381f9SStefano Zampini cusp->cooPerm_w = NULL; 29867e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 29877e8381f9SStefano Zampini } 29887e8381f9SStefano Zampini if (n) { 29897e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 29907e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 29917e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 29927e8381f9SStefano Zampini 29937e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 29947e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 29957e8381f9SStefano Zampini 29967e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 29977e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 29987e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 29997e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 30007e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 30017e8381f9SStefano Zampini 30027e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 30037e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 30047e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 30057e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 30067e8381f9SStefano Zampini 30077e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 30087e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 30097e8381f9SStefano Zampini delete cusp->cooPerm_a; 30107e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 30117e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 30127e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 30137e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 30147e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 30157e8381f9SStefano Zampini w[0] = 0; 30167e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 30177e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 30187e8381f9SStefano Zampini } 30197e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 30207e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 30217e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 30227e8381f9SStefano Zampini ii.begin()); 30237e8381f9SStefano Zampini 30247e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 30257e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 30267e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 30277e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 30287e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 30297e8381f9SStefano Zampini a->i[0] = 0; 30307e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 30317e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 30327e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 30337e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 30347e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 30357e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 30367e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 30377e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 30387e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 30397e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 30407e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 30417e8381f9SStefano Zampini } 30427e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 30437e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 30447e8381f9SStefano Zampini } else { 30457e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 30467e8381f9SStefano Zampini } 30477e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 30487e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 30497e8381f9SStefano Zampini 30507e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 30517e8381f9SStefano Zampini The code is so convoluted now that I prefer to copy garbage to the GPU */ 30527e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 30537e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 30547e8381f9SStefano Zampini A->nonzerostate++; 30557e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 30567e8381f9SStefano Zampini { 30577e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 30587e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 30597e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 30607e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 30617e8381f9SStefano Zampini } 30627e8381f9SStefano Zampini 30637e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 30647e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 30657e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 30667e8381f9SStefano Zampini PetscFunctionReturn(0); 30677e8381f9SStefano Zampini } 3068