19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16bc3f50f2SPaul Mullowney 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 686fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 759ae82921SPaul Mullowney 767f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 79ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 827f756511SDominic Meiser 837e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 847e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 857e8381f9SStefano Zampini 86b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 87b06137fdSPaul Mullowney { 88b06137fdSPaul Mullowney cusparseStatus_t stat; 89b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 90b06137fdSPaul Mullowney 91b06137fdSPaul Mullowney PetscFunctionBegin; 92d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 93b06137fdSPaul Mullowney cusparsestruct->stream = stream; 9457d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 95b06137fdSPaul Mullowney PetscFunctionReturn(0); 96b06137fdSPaul Mullowney } 97b06137fdSPaul Mullowney 98b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 99b06137fdSPaul Mullowney { 100b06137fdSPaul Mullowney cusparseStatus_t stat; 101b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 102b06137fdSPaul Mullowney 103b06137fdSPaul Mullowney PetscFunctionBegin; 104d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1056b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 10616a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 10757d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 10816a2e217SAlejandro Lamas Daviña } 109b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1106b1cf21dSAlejandro Lamas Daviña } 11157d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 112b06137fdSPaul Mullowney PetscFunctionReturn(0); 113b06137fdSPaul Mullowney } 114b06137fdSPaul Mullowney 115b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 116b06137fdSPaul Mullowney { 117b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1187e8381f9SStefano Zampini PetscBool flg; 1197e8381f9SStefano Zampini PetscErrorCode ierr; 120ccdfe979SStefano Zampini 121b06137fdSPaul Mullowney PetscFunctionBegin; 1227e8381f9SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1237e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 124ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 125b06137fdSPaul Mullowney PetscFunctionReturn(0); 126b06137fdSPaul Mullowney } 127b06137fdSPaul Mullowney 128ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1299ae82921SPaul Mullowney { 1309ae82921SPaul Mullowney PetscFunctionBegin; 1319ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1329ae82921SPaul Mullowney PetscFunctionReturn(0); 1339ae82921SPaul Mullowney } 1349ae82921SPaul Mullowney 135c708e6cdSJed Brown /*MC 136087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 137087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 138087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 139087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 140087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 141087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 142c708e6cdSJed Brown 1439ae82921SPaul Mullowney Level: beginner 144c708e6cdSJed Brown 1453ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 146c708e6cdSJed Brown M*/ 1479ae82921SPaul Mullowney 14842c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1499ae82921SPaul Mullowney { 1509ae82921SPaul Mullowney PetscErrorCode ierr; 151bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1529ae82921SPaul Mullowney 1539ae82921SPaul Mullowney PetscFunctionBegin; 154bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 155bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1562c7c0729SBarry Smith (*B)->factortype = ftype; 1572c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1589ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1592205254eSKarl Rupp 160087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16133d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1629ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1639ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 164087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 165087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 166087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1679ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 168bc3f50f2SPaul Mullowney 169fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1703ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1719ae82921SPaul Mullowney PetscFunctionReturn(0); 1729ae82921SPaul Mullowney } 1739ae82921SPaul Mullowney 174bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 175ca45077fSPaul Mullowney { 176aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1776e111a19SKarl Rupp 178ca45077fSPaul Mullowney PetscFunctionBegin; 179ca45077fSPaul Mullowney switch (op) { 180e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 181aa372e3fSPaul Mullowney cusparsestruct->format = format; 182ca45077fSPaul Mullowney break; 183e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 184aa372e3fSPaul Mullowney cusparsestruct->format = format; 185ca45077fSPaul Mullowney break; 186ca45077fSPaul Mullowney default: 18736d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 188ca45077fSPaul Mullowney } 189ca45077fSPaul Mullowney PetscFunctionReturn(0); 190ca45077fSPaul Mullowney } 1919ae82921SPaul Mullowney 192e057df02SPaul Mullowney /*@ 193e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 194e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 195aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 196e057df02SPaul Mullowney Not Collective 197e057df02SPaul Mullowney 198e057df02SPaul Mullowney Input Parameters: 1998468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 20036d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2012692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 202e057df02SPaul Mullowney 203e057df02SPaul Mullowney Output Parameter: 204e057df02SPaul Mullowney 205e057df02SPaul Mullowney Level: intermediate 206e057df02SPaul Mullowney 2078468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 208e057df02SPaul Mullowney @*/ 209e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 210e057df02SPaul Mullowney { 211e057df02SPaul Mullowney PetscErrorCode ierr; 2126e111a19SKarl Rupp 213e057df02SPaul Mullowney PetscFunctionBegin; 214e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 215e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 216e057df02SPaul Mullowney PetscFunctionReturn(0); 217e057df02SPaul Mullowney } 218e057df02SPaul Mullowney 219e6e9a74fSStefano Zampini /*@ 220e6e9a74fSStefano Zampini MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 221e6e9a74fSStefano Zampini 222e6e9a74fSStefano Zampini Collective on mat 223e6e9a74fSStefano Zampini 224e6e9a74fSStefano Zampini Input Parameters: 225e6e9a74fSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE 226e6e9a74fSStefano Zampini - transgen - the boolean flag 227e6e9a74fSStefano Zampini 228e6e9a74fSStefano Zampini Level: intermediate 229e6e9a74fSStefano Zampini 230e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE 231e6e9a74fSStefano Zampini @*/ 232e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 233e6e9a74fSStefano Zampini { 234e6e9a74fSStefano Zampini PetscErrorCode ierr; 235e6e9a74fSStefano Zampini PetscBool flg; 236e6e9a74fSStefano Zampini 237e6e9a74fSStefano Zampini PetscFunctionBegin; 238e6e9a74fSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 239e6e9a74fSStefano Zampini ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 240e6e9a74fSStefano Zampini if (flg) { 241e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 24254da937aSStefano Zampini 243e6e9a74fSStefano Zampini if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 244e6e9a74fSStefano Zampini cusp->transgen = transgen; 24554da937aSStefano Zampini if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 24654da937aSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 24754da937aSStefano Zampini } 248e6e9a74fSStefano Zampini } 249e6e9a74fSStefano Zampini PetscFunctionReturn(0); 250e6e9a74fSStefano Zampini } 251e6e9a74fSStefano Zampini 2524416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2539ae82921SPaul Mullowney { 2549ae82921SPaul Mullowney PetscErrorCode ierr; 255e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2569ae82921SPaul Mullowney PetscBool flg; 257a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2586e111a19SKarl Rupp 2599ae82921SPaul Mullowney PetscFunctionBegin; 260e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2619ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 26254da937aSStefano Zampini PetscBool transgen = cusparsestruct->transgen; 26354da937aSStefano Zampini 26454da937aSStefano Zampini ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 265afb2bd1cSJunchao Zhang if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 266afb2bd1cSJunchao Zhang 267e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 268a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 269afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 270afb2bd1cSJunchao Zhang 2714c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 272a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 273afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 274afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 275afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 276afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 277afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 278afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 279afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 280afb2bd1cSJunchao Zhang 281afb2bd1cSJunchao Zhang cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 282afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 283afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 284afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 285afb2bd1cSJunchao Zhang 286afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 287afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 288afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 289afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 290afb2bd1cSJunchao Zhang #endif 2914c87dfd4SPaul Mullowney } 2920af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 2939ae82921SPaul Mullowney PetscFunctionReturn(0); 2949ae82921SPaul Mullowney } 2959ae82921SPaul Mullowney 2966fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2979ae82921SPaul Mullowney { 298da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2999ae82921SPaul Mullowney PetscErrorCode ierr; 3009ae82921SPaul Mullowney 3019ae82921SPaul Mullowney PetscFunctionBegin; 302da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3039ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3049ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3059ae82921SPaul Mullowney PetscFunctionReturn(0); 3069ae82921SPaul Mullowney } 3079ae82921SPaul Mullowney 3086fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3099ae82921SPaul Mullowney { 310da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3119ae82921SPaul Mullowney PetscErrorCode ierr; 3129ae82921SPaul Mullowney 3139ae82921SPaul Mullowney PetscFunctionBegin; 314da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3159ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3169ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3179ae82921SPaul Mullowney PetscFunctionReturn(0); 3189ae82921SPaul Mullowney } 3199ae82921SPaul Mullowney 320087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 321087f3262SPaul Mullowney { 322da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 323087f3262SPaul Mullowney PetscErrorCode ierr; 324087f3262SPaul Mullowney 325087f3262SPaul Mullowney PetscFunctionBegin; 326da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 327087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 328087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 329087f3262SPaul Mullowney PetscFunctionReturn(0); 330087f3262SPaul Mullowney } 331087f3262SPaul Mullowney 332087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 333087f3262SPaul Mullowney { 334da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 335087f3262SPaul Mullowney PetscErrorCode ierr; 336087f3262SPaul Mullowney 337087f3262SPaul Mullowney PetscFunctionBegin; 338da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 339087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 340087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 341087f3262SPaul Mullowney PetscFunctionReturn(0); 342087f3262SPaul Mullowney } 343087f3262SPaul Mullowney 344087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3459ae82921SPaul Mullowney { 3469ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3479ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3489ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 349aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3509ae82921SPaul Mullowney cusparseStatus_t stat; 3519ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3529ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3539ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3549ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 355b175d8bbSPaul Mullowney PetscErrorCode ierr; 35657d48284SJunchao Zhang cudaError_t cerr; 3579ae82921SPaul Mullowney 3589ae82921SPaul Mullowney PetscFunctionBegin; 359cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 360c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3619ae82921SPaul Mullowney try { 3629ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3639ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 364da79fbbcSStefano Zampini if (!loTriFactor) { 3652cbc15d9SMark PetscScalar *AALo; 3662cbc15d9SMark 3672cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3689ae82921SPaul Mullowney 3699ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 37057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 37157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 3729ae82921SPaul Mullowney 3739ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3749ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3759ae82921SPaul Mullowney AiLo[n] = nzLower; 3769ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3779ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3789ae82921SPaul Mullowney v = aa; 3799ae82921SPaul Mullowney vi = aj; 3809ae82921SPaul Mullowney offset = 1; 3819ae82921SPaul Mullowney rowOffset= 1; 3829ae82921SPaul Mullowney for (i=1; i<n; i++) { 3839ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 384e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3859ae82921SPaul Mullowney AiLo[i] = rowOffset; 3869ae82921SPaul Mullowney rowOffset += nz+1; 3879ae82921SPaul Mullowney 388580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 389580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 3909ae82921SPaul Mullowney 3919ae82921SPaul Mullowney offset += nz; 3929ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3939ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3949ae82921SPaul Mullowney offset += 1; 3959ae82921SPaul Mullowney 3969ae82921SPaul Mullowney v += nz; 3979ae82921SPaul Mullowney vi += nz; 3989ae82921SPaul Mullowney } 3992205254eSKarl Rupp 400aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 401da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 402da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 403aa372e3fSPaul Mullowney /* Create the matrix description */ 40457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 40557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4061b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 407afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 408afb2bd1cSJunchao Zhang #else 40957d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 410afb2bd1cSJunchao Zhang #endif 41157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 41257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 413aa372e3fSPaul Mullowney 414aa372e3fSPaul Mullowney /* set the operation */ 415aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 416aa372e3fSPaul Mullowney 417aa372e3fSPaul Mullowney /* set the matrix */ 418aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 419aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 420aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 421aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 422aa372e3fSPaul Mullowney 423aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 424aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 425aa372e3fSPaul Mullowney 426aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 427aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 428aa372e3fSPaul Mullowney 429aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 430aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 431aa372e3fSPaul Mullowney 432afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 433da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 434afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4351b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 436afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 437afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 438afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 439afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 440afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 441afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 442afb2bd1cSJunchao Zhang #endif 443afb2bd1cSJunchao Zhang 444aa372e3fSPaul Mullowney /* perform the solve analysis */ 445aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 446aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 447aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 448afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 4491b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 450afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 451afb2bd1cSJunchao Zhang #endif 452afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 453da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 454da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 455aa372e3fSPaul Mullowney 456da79fbbcSStefano Zampini /* assign the pointer */ 457aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4582cbc15d9SMark loTriFactor->AA_h = AALo; 45957d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 46057d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4614863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 462da79fbbcSStefano Zampini } else { /* update values only */ 4632cbc15d9SMark if (!loTriFactor->AA_h) { 4642cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4652cbc15d9SMark } 466da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4672cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 468da79fbbcSStefano Zampini v = aa; 469da79fbbcSStefano Zampini vi = aj; 470da79fbbcSStefano Zampini offset = 1; 471da79fbbcSStefano Zampini for (i=1; i<n; i++) { 472da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 4732cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 474da79fbbcSStefano Zampini offset += nz; 4752cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 476da79fbbcSStefano Zampini offset += 1; 477da79fbbcSStefano Zampini v += nz; 478da79fbbcSStefano Zampini } 4792cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 480da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 481da79fbbcSStefano Zampini } 4829ae82921SPaul Mullowney } catch(char *ex) { 4839ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4849ae82921SPaul Mullowney } 4859ae82921SPaul Mullowney } 4869ae82921SPaul Mullowney PetscFunctionReturn(0); 4879ae82921SPaul Mullowney } 4889ae82921SPaul Mullowney 489087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4909ae82921SPaul Mullowney { 4919ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4929ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4939ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 494aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4959ae82921SPaul Mullowney cusparseStatus_t stat; 4969ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 4979ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4989ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4999ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5009ae82921SPaul Mullowney PetscErrorCode ierr; 50157d48284SJunchao Zhang cudaError_t cerr; 5029ae82921SPaul Mullowney 5039ae82921SPaul Mullowney PetscFunctionBegin; 504cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 505c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5069ae82921SPaul Mullowney try { 5079ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 5089ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 509da79fbbcSStefano Zampini if (!upTriFactor) { 5102cbc15d9SMark PetscScalar *AAUp; 5112cbc15d9SMark 5122cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5132cbc15d9SMark 5149ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 51557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 51657d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5179ae82921SPaul Mullowney 5189ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5199ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5209ae82921SPaul Mullowney AiUp[n]=nzUpper; 5219ae82921SPaul Mullowney offset = nzUpper; 5229ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5239ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5249ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5259ae82921SPaul Mullowney 526e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5279ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5289ae82921SPaul Mullowney 529e057df02SPaul Mullowney /* decrement the offset */ 5309ae82921SPaul Mullowney offset -= (nz+1); 5319ae82921SPaul Mullowney 532e057df02SPaul Mullowney /* first, set the diagonal elements */ 5339ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 53409f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5359ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5369ae82921SPaul Mullowney 537580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 538580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5399ae82921SPaul Mullowney } 5402205254eSKarl Rupp 541aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 542da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 543da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5442205254eSKarl Rupp 545aa372e3fSPaul Mullowney /* Create the matrix description */ 54657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 54757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5481b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 549afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 550afb2bd1cSJunchao Zhang #else 55157d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 552afb2bd1cSJunchao Zhang #endif 55357d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 55457d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 555aa372e3fSPaul Mullowney 556aa372e3fSPaul Mullowney /* set the operation */ 557aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 558aa372e3fSPaul Mullowney 559aa372e3fSPaul Mullowney /* set the matrix */ 560aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 561aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 562aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 563aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 564aa372e3fSPaul Mullowney 565aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 566aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 567aa372e3fSPaul Mullowney 568aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 569aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 570aa372e3fSPaul Mullowney 571aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 572aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 573aa372e3fSPaul Mullowney 574afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 575da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 576afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5771b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 578afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 579afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 580afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 581afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 582afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 583afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 584afb2bd1cSJunchao Zhang #endif 585afb2bd1cSJunchao Zhang 586aa372e3fSPaul Mullowney /* perform the solve analysis */ 587aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 588aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 589aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 590afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 5911b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 592afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 593afb2bd1cSJunchao Zhang #endif 594afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 595da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 596da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 597aa372e3fSPaul Mullowney 598da79fbbcSStefano Zampini /* assign the pointer */ 599aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6002cbc15d9SMark upTriFactor->AA_h = AAUp; 60157d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 60257d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6034863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 604da79fbbcSStefano Zampini } else { 6052cbc15d9SMark if (!upTriFactor->AA_h) { 6062cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6072cbc15d9SMark } 608da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 609da79fbbcSStefano Zampini offset = nzUpper; 610da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 611da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 612da79fbbcSStefano Zampini 613da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 614da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 615da79fbbcSStefano Zampini 616da79fbbcSStefano Zampini /* decrement the offset */ 617da79fbbcSStefano Zampini offset -= (nz+1); 618da79fbbcSStefano Zampini 619da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6202cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6212cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 622da79fbbcSStefano Zampini } 6232cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 624da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 625da79fbbcSStefano Zampini } 6269ae82921SPaul Mullowney } catch(char *ex) { 6279ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6289ae82921SPaul Mullowney } 6299ae82921SPaul Mullowney } 6309ae82921SPaul Mullowney PetscFunctionReturn(0); 6319ae82921SPaul Mullowney } 6329ae82921SPaul Mullowney 633087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6349ae82921SPaul Mullowney { 6359ae82921SPaul Mullowney PetscErrorCode ierr; 6369ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6379ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6389ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6399ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6409ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6419ae82921SPaul Mullowney 6429ae82921SPaul Mullowney PetscFunctionBegin; 643da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 644087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 645087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6462205254eSKarl Rupp 647da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 648aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6499ae82921SPaul Mullowney 650c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 651e057df02SPaul Mullowney /* lower triangular indices */ 6529ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 653da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 654da79fbbcSStefano Zampini const PetscInt *r; 655da79fbbcSStefano Zampini 656da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 657aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 658aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6599ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 660da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 661da79fbbcSStefano Zampini } 6629ae82921SPaul Mullowney 663e057df02SPaul Mullowney /* upper triangular indices */ 6649ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 665da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 666da79fbbcSStefano Zampini const PetscInt *c; 667da79fbbcSStefano Zampini 668da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 669aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 670aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6719ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 672da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 673da79fbbcSStefano Zampini } 6749ae82921SPaul Mullowney PetscFunctionReturn(0); 6759ae82921SPaul Mullowney } 6769ae82921SPaul Mullowney 677087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 678087f3262SPaul Mullowney { 679087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 680087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 681aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 682aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 683087f3262SPaul Mullowney cusparseStatus_t stat; 684087f3262SPaul Mullowney PetscErrorCode ierr; 68557d48284SJunchao Zhang cudaError_t cerr; 686087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 687087f3262SPaul Mullowney PetscScalar *AAUp; 688087f3262SPaul Mullowney PetscScalar *AALo; 689087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 690087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 691087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 692087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 693087f3262SPaul Mullowney 694087f3262SPaul Mullowney PetscFunctionBegin; 695cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 696c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 697087f3262SPaul Mullowney try { 698da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 699da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 700da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 701087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 70257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 70357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 704087f3262SPaul Mullowney 705087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 706087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 707087f3262SPaul Mullowney AiUp[n]=nzUpper; 708087f3262SPaul Mullowney offset = 0; 709087f3262SPaul Mullowney for (i=0; i<n; i++) { 710087f3262SPaul Mullowney /* set the pointers */ 711087f3262SPaul Mullowney v = aa + ai[i]; 712087f3262SPaul Mullowney vj = aj + ai[i]; 713087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 714087f3262SPaul Mullowney 715087f3262SPaul Mullowney /* first, set the diagonal elements */ 716087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 71709f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 718087f3262SPaul Mullowney AiUp[i] = offset; 71909f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 720087f3262SPaul Mullowney 721087f3262SPaul Mullowney offset+=1; 722087f3262SPaul Mullowney if (nz>0) { 723f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 724580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 725087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 726087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 727087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 728087f3262SPaul Mullowney } 729087f3262SPaul Mullowney offset+=nz; 730087f3262SPaul Mullowney } 731087f3262SPaul Mullowney } 732087f3262SPaul Mullowney 733aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 734da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 735da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 736087f3262SPaul Mullowney 737aa372e3fSPaul Mullowney /* Create the matrix description */ 73857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 73957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 741afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 742afb2bd1cSJunchao Zhang #else 74357d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 744afb2bd1cSJunchao Zhang #endif 74557d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 74657d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 747087f3262SPaul Mullowney 748aa372e3fSPaul Mullowney /* set the matrix */ 749aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 750aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 751aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 752aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 753aa372e3fSPaul Mullowney 754aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 755aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 756aa372e3fSPaul Mullowney 757aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 758aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 759aa372e3fSPaul Mullowney 760aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 761aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 762aa372e3fSPaul Mullowney 763afb2bd1cSJunchao Zhang /* set the operation */ 764afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 765afb2bd1cSJunchao Zhang 766afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 767da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 768afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7691b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 770afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 771afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 772afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 773afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 774afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 775afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 776afb2bd1cSJunchao Zhang #endif 777afb2bd1cSJunchao Zhang 778aa372e3fSPaul Mullowney /* perform the solve analysis */ 779aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 780aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 781aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 782afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 7831b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 784afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 785afb2bd1cSJunchao Zhang #endif 786afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 787da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 788da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 789aa372e3fSPaul Mullowney 790da79fbbcSStefano Zampini /* assign the pointer */ 791aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 792aa372e3fSPaul Mullowney 793aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 794da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 795da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 796aa372e3fSPaul Mullowney 797aa372e3fSPaul Mullowney /* Create the matrix description */ 79857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 79957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 801afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 802afb2bd1cSJunchao Zhang #else 80357d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 804afb2bd1cSJunchao Zhang #endif 80557d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 80657d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 807aa372e3fSPaul Mullowney 808aa372e3fSPaul Mullowney /* set the operation */ 809aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 810aa372e3fSPaul Mullowney 811aa372e3fSPaul Mullowney /* set the matrix */ 812aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 813aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 814aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 815aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 816aa372e3fSPaul Mullowney 817aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 818aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 819aa372e3fSPaul Mullowney 820aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 821aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 822aa372e3fSPaul Mullowney 823aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 824aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 825aa372e3fSPaul Mullowney 826afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 827da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 828afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8291b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 830afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 831afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 832afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 833afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 834afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 835afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 836afb2bd1cSJunchao Zhang #endif 837afb2bd1cSJunchao Zhang 838aa372e3fSPaul Mullowney /* perform the solve analysis */ 839aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 840aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 841aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 842afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 8431b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 844afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 845afb2bd1cSJunchao Zhang #endif 846afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 847da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 848da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 849aa372e3fSPaul Mullowney 850da79fbbcSStefano Zampini /* assign the pointer */ 851aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 852087f3262SPaul Mullowney 853da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 85457d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 85557d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 856da79fbbcSStefano Zampini } else { 857da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 858da79fbbcSStefano Zampini offset = 0; 859da79fbbcSStefano Zampini for (i=0; i<n; i++) { 860da79fbbcSStefano Zampini /* set the pointers */ 861da79fbbcSStefano Zampini v = aa + ai[i]; 862da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 863da79fbbcSStefano Zampini 864da79fbbcSStefano Zampini /* first, set the diagonal elements */ 865da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 866da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 867da79fbbcSStefano Zampini 868da79fbbcSStefano Zampini offset+=1; 869da79fbbcSStefano Zampini if (nz>0) { 870da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 871da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 872da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 873da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 874da79fbbcSStefano Zampini } 875da79fbbcSStefano Zampini offset+=nz; 876da79fbbcSStefano Zampini } 877da79fbbcSStefano Zampini } 878da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 879da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 880da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 881da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 882da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 883da79fbbcSStefano Zampini } 88457d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 88557d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 886087f3262SPaul Mullowney } catch(char *ex) { 887087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 888087f3262SPaul Mullowney } 889087f3262SPaul Mullowney } 890087f3262SPaul Mullowney PetscFunctionReturn(0); 891087f3262SPaul Mullowney } 892087f3262SPaul Mullowney 893087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8949ae82921SPaul Mullowney { 8959ae82921SPaul Mullowney PetscErrorCode ierr; 896087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 897087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 898087f3262SPaul Mullowney IS ip = a->row; 899087f3262SPaul Mullowney PetscBool perm_identity; 900087f3262SPaul Mullowney PetscInt n = A->rmap->n; 901087f3262SPaul Mullowney 902087f3262SPaul Mullowney PetscFunctionBegin; 903da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 904087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 905da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 906aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 907aa372e3fSPaul Mullowney 908da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 909da79fbbcSStefano Zampini 910087f3262SPaul Mullowney /* lower triangular indices */ 911087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 912087f3262SPaul Mullowney if (!perm_identity) { 9134e4bbfaaSStefano Zampini IS iip; 914da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9154e4bbfaaSStefano Zampini 9164e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9174e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 918da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 919aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 920aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 921aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9224e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9234e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9244e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 925087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 926da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 927da79fbbcSStefano Zampini } 928087f3262SPaul Mullowney PetscFunctionReturn(0); 929087f3262SPaul Mullowney } 930087f3262SPaul Mullowney 9316fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 9329ae82921SPaul Mullowney { 9339ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 9349ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 9359ae82921SPaul Mullowney PetscBool row_identity,col_identity; 936b175d8bbSPaul Mullowney PetscErrorCode ierr; 9379ae82921SPaul Mullowney 9389ae82921SPaul Mullowney PetscFunctionBegin; 9399ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 940ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 941e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 9429ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 9439ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 944bda325fcSPaul Mullowney if (row_identity && col_identity) { 945bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 946bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9474e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9484e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 949bda325fcSPaul Mullowney } else { 950bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 951bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9524e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9534e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 954bda325fcSPaul Mullowney } 9558dc1d2a3SPaul Mullowney 956e057df02SPaul Mullowney /* get the triangular factors */ 957087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 9589ae82921SPaul Mullowney PetscFunctionReturn(0); 9599ae82921SPaul Mullowney } 9609ae82921SPaul Mullowney 961087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 962087f3262SPaul Mullowney { 963087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 964087f3262SPaul Mullowney IS ip = b->row; 965087f3262SPaul Mullowney PetscBool perm_identity; 966b175d8bbSPaul Mullowney PetscErrorCode ierr; 967087f3262SPaul Mullowney 968087f3262SPaul Mullowney PetscFunctionBegin; 969087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 971087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 972087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973087f3262SPaul Mullowney if (perm_identity) { 974087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9764e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9774e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 978087f3262SPaul Mullowney } else { 979087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9814e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9824e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 983087f3262SPaul Mullowney } 984087f3262SPaul Mullowney 985087f3262SPaul Mullowney /* get the triangular factors */ 986087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987087f3262SPaul Mullowney PetscFunctionReturn(0); 988087f3262SPaul Mullowney } 9899ae82921SPaul Mullowney 990b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991bda325fcSPaul Mullowney { 992bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997bda325fcSPaul Mullowney cusparseStatus_t stat; 998aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 999aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1000aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1001aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 10021b0a6780SStefano Zampini cudaError_t cerr; 1003da79fbbcSStefano Zampini PetscErrorCode ierr; 1004b175d8bbSPaul Mullowney 1005bda325fcSPaul Mullowney PetscFunctionBegin; 1006aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1007da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009aa372e3fSPaul Mullowney 1010aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1011aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1012aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1014aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016aa372e3fSPaul Mullowney 1017aa372e3fSPaul Mullowney /* Create the matrix description */ 101857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 101957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 102057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 102157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 102257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023aa372e3fSPaul Mullowney 1024aa372e3fSPaul Mullowney /* set the operation */ 1025aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026aa372e3fSPaul Mullowney 1027aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1028aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1029afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035aa372e3fSPaul Mullowney 1036aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1037afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1041afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1042afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1043afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1044afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1046afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10471b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048afb2bd1cSJunchao Zhang #endif 1049afb2bd1cSJunchao Zhang 1050da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1051aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1054aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1055aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1056aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1057afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1060afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1061afb2bd1cSJunchao Zhang #else 1062afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1064afb2bd1cSJunchao Zhang #endif 1065afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1066da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1067da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1068aa372e3fSPaul Mullowney 1069afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1070da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1071afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1073afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1074afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1075afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1077afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1078afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1079afb2bd1cSJunchao Zhang #endif 1080afb2bd1cSJunchao Zhang 1081afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1082aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1083afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1084afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1085afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 10861b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1087afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1088afb2bd1cSJunchao Zhang #endif 1089afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1090da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1091da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1092aa372e3fSPaul Mullowney 1093da79fbbcSStefano Zampini /* assign the pointer */ 1094aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1095aa372e3fSPaul Mullowney 1096aa372e3fSPaul Mullowney /*********************************************/ 1097aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1098aa372e3fSPaul Mullowney /*********************************************/ 1099aa372e3fSPaul Mullowney 1100aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1101da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1102da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1103aa372e3fSPaul Mullowney 1104aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1105aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1106aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1107aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1108aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1109aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1110aa372e3fSPaul Mullowney 1111aa372e3fSPaul Mullowney /* Create the matrix description */ 111257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 111357d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 111457d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 111557d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 111657d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1117aa372e3fSPaul Mullowney 1118aa372e3fSPaul Mullowney /* set the operation */ 1119aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1120aa372e3fSPaul Mullowney 1121aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1122aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1123afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1124afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1125aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1126afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1127afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1128afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1129aa372e3fSPaul Mullowney 1130aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1131afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1132afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1133afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1134afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1135afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1136afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1137afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1138afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1139afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1140afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1141afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1142afb2bd1cSJunchao Zhang #endif 1143afb2bd1cSJunchao Zhang 1144da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1145aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1146aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1147aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1148aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1149aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1150aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1151afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1152afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1153afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1154afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1155afb2bd1cSJunchao Zhang #else 1156afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1157afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1158afb2bd1cSJunchao Zhang #endif 1159afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1160da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1161da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1162aa372e3fSPaul Mullowney 1163afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1164da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1165afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11661b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1167afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1168afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1169afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1170afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1171afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1172afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1173afb2bd1cSJunchao Zhang #endif 1174afb2bd1cSJunchao Zhang 1175afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1176aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1177afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1178afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1179afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 11801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1181afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1182afb2bd1cSJunchao Zhang #endif 1183afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1184da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1185da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1186aa372e3fSPaul Mullowney 1187da79fbbcSStefano Zampini /* assign the pointer */ 1188aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1189bda325fcSPaul Mullowney PetscFunctionReturn(0); 1190bda325fcSPaul Mullowney } 1191bda325fcSPaul Mullowney 1192b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1193bda325fcSPaul Mullowney { 1194aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1195aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1196aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1197bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1198bda325fcSPaul Mullowney cusparseStatus_t stat; 1199aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1200b06137fdSPaul Mullowney cudaError_t err; 120185ba7357SStefano Zampini PetscErrorCode ierr; 1202b175d8bbSPaul Mullowney 1203bda325fcSPaul Mullowney PetscFunctionBegin; 120485ba7357SStefano Zampini if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 120585ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 120685ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 120785ba7357SStefano Zampini /* create cusparse matrix */ 1208aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 120957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1210aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 121157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 121257d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1213aa372e3fSPaul Mullowney 1214b06137fdSPaul Mullowney /* set alpha and beta */ 1215afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12167656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12177656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1218afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12197656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12207656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 122157d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1222b06137fdSPaul Mullowney 1223aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1224aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1225aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 1226554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1227554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1228aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1229a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1230aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1231aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1232a3fdcf43SKarl Rupp 123381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 123481902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1235afb2bd1cSJunchao Zhang 123681902715SJunchao Zhang /* compute the transpose, i.e. the CSC */ 1237afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1238afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1239afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1240afb2bd1cSJunchao Zhang matrix->values->data().get(), 1241afb2bd1cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1242afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1243afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1244afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1245afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1246afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1247afb2bd1cSJunchao Zhang err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1248afb2bd1cSJunchao Zhang #endif 1249afb2bd1cSJunchao Zhang 1250a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1251a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 1252aa372e3fSPaul Mullowney matrix->values->data().get(), 125381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1254aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1255aa372e3fSPaul Mullowney matrixT->values->data().get(), 1256afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1258afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1259afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1260afb2bd1cSJunchao Zhang #else 1261afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1262afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1263afb2bd1cSJunchao Zhang #endif 1264afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1265aa372e3fSPaul Mullowney matstructT->mat = matrixT; 1266afb2bd1cSJunchao Zhang 1267afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1268afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1269afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1270afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1271afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1272afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1273afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1274afb2bd1cSJunchao Zhang #endif 1275aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1276afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1277afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1278afb2bd1cSJunchao Zhang #else 1279aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 128051c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 128151c6d536SStefano Zampini /* First convert HYB to CSR */ 1282aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1283aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1284aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1285aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1286aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1287aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1288aa372e3fSPaul Mullowney 1289aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1290aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1291aa372e3fSPaul Mullowney temp->values->data().get(), 1292aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 129357d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1294aa372e3fSPaul Mullowney 1295aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1296aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1297aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1298aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1299aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1300aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1301aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1302aa372e3fSPaul Mullowney 1303aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1304aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1305aa372e3fSPaul Mullowney temp->values->data().get(), 1306aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1307aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1308aa372e3fSPaul Mullowney tempT->values->data().get(), 1309aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1310aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 131157d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1312aa372e3fSPaul Mullowney 1313aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1314aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 131557d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1316aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1317aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1318aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1319aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1320aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1321aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 132257d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1323aa372e3fSPaul Mullowney 1324aa372e3fSPaul Mullowney /* assign the pointer */ 1325aa372e3fSPaul Mullowney matstructT->mat = hybMat; 1326aa372e3fSPaul Mullowney /* delete temporaries */ 1327aa372e3fSPaul Mullowney if (tempT) { 1328aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1329aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1330aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1331aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1332087f3262SPaul Mullowney } 1333aa372e3fSPaul Mullowney if (temp) { 1334aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1335aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1336aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1337aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1338aa372e3fSPaul Mullowney } 1339afb2bd1cSJunchao Zhang #endif 1340aa372e3fSPaul Mullowney } 134105035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 134285ba7357SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 134385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1344213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1345213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1346aa372e3fSPaul Mullowney /* assign the pointer */ 1347aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1348bda325fcSPaul Mullowney PetscFunctionReturn(0); 1349bda325fcSPaul Mullowney } 1350bda325fcSPaul Mullowney 13514e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 13526fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1353bda325fcSPaul Mullowney { 1354c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1355465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1356465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1357465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1358465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1359bda325fcSPaul Mullowney cusparseStatus_t stat; 1360bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1361aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1362aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1363aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1364b175d8bbSPaul Mullowney PetscErrorCode ierr; 136557d48284SJunchao Zhang cudaError_t cerr; 1366bda325fcSPaul Mullowney 1367bda325fcSPaul Mullowney PetscFunctionBegin; 1368aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1369aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1370bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1371aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1372aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1373bda325fcSPaul Mullowney } 1374bda325fcSPaul Mullowney 1375bda325fcSPaul Mullowney /* Get the GPU pointers */ 1376c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1377c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1378c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1379c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1380bda325fcSPaul Mullowney 13817a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1382aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1383c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1384c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1385c41cb2e2SAlejandro Lamas Daviña xGPU); 1386aa372e3fSPaul Mullowney 1387aa372e3fSPaul Mullowney /* First, solve U */ 1388aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1389afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 13901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1391afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1392afb2bd1cSJunchao Zhang #endif 1393afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1394aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1395aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1396aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1397aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1398afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 13991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1400afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1401afb2bd1cSJunchao Zhang #endif 1402afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1403aa372e3fSPaul Mullowney 1404aa372e3fSPaul Mullowney /* Then, solve L */ 1405aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1406afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14071b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1408afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1409afb2bd1cSJunchao Zhang #endif 1410afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1411aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1412aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1413aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1414aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1415afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14161b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1417afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1418afb2bd1cSJunchao Zhang #endif 1419afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1420aa372e3fSPaul Mullowney 1421aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1422c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1423c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1424aa372e3fSPaul Mullowney tempGPU->begin()); 1425aa372e3fSPaul Mullowney 1426aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1427c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1428bda325fcSPaul Mullowney 1429bda325fcSPaul Mullowney /* restore */ 1430c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1431c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 143205035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1433661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1434958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1435bda325fcSPaul Mullowney PetscFunctionReturn(0); 1436bda325fcSPaul Mullowney } 1437bda325fcSPaul Mullowney 14386fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1439bda325fcSPaul Mullowney { 1440465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1441465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1442bda325fcSPaul Mullowney cusparseStatus_t stat; 1443bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1444aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1445aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1446aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1447b175d8bbSPaul Mullowney PetscErrorCode ierr; 144857d48284SJunchao Zhang cudaError_t cerr; 1449bda325fcSPaul Mullowney 1450bda325fcSPaul Mullowney PetscFunctionBegin; 1451aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1452aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1453bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1454aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1455aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1456bda325fcSPaul Mullowney } 1457bda325fcSPaul Mullowney 1458bda325fcSPaul Mullowney /* Get the GPU pointers */ 1459c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1460c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1461bda325fcSPaul Mullowney 14627a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1463aa372e3fSPaul Mullowney /* First, solve U */ 1464aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1465afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14661b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1467afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1468afb2bd1cSJunchao Zhang #endif 1469afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1470aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1471aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1472aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1473aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1474afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 14751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1476afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1477afb2bd1cSJunchao Zhang #endif 1478afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1479aa372e3fSPaul Mullowney 1480aa372e3fSPaul Mullowney /* Then, solve L */ 1481aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1482afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14831b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1484afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1485afb2bd1cSJunchao Zhang #endif 1486afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1487aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1488aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1489aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1490aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1491afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14921b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1493afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1494afb2bd1cSJunchao Zhang #endif 1495afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1496bda325fcSPaul Mullowney 1497bda325fcSPaul Mullowney /* restore */ 1498c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1499c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 150005035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1501661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1502958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1503bda325fcSPaul Mullowney PetscFunctionReturn(0); 1504bda325fcSPaul Mullowney } 1505bda325fcSPaul Mullowney 15066fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15079ae82921SPaul Mullowney { 1508465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1509465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1510465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1511465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15129ae82921SPaul Mullowney cusparseStatus_t stat; 15139ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1514aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1515aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1516aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1517b175d8bbSPaul Mullowney PetscErrorCode ierr; 151857d48284SJunchao Zhang cudaError_t cerr; 15199ae82921SPaul Mullowney 15209ae82921SPaul Mullowney PetscFunctionBegin; 1521ebc8f436SDominic Meiser 1522e057df02SPaul Mullowney /* Get the GPU pointers */ 1523c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1526c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15279ae82921SPaul Mullowney 15287a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1529aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1530c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1531c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15324e4bbfaaSStefano Zampini tempGPU->begin()); 1533aa372e3fSPaul Mullowney 1534aa372e3fSPaul Mullowney /* Next, solve L */ 1535aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1536afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15371b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1538afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1539afb2bd1cSJunchao Zhang #endif 1540afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1541aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1542aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1543aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1544aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1545afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 15461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1547afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1548afb2bd1cSJunchao Zhang #endif 1549afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1550aa372e3fSPaul Mullowney 1551aa372e3fSPaul Mullowney /* Then, solve U */ 1552aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1553afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1555afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1556afb2bd1cSJunchao Zhang #endif 1557afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1558aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1559aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1560aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1561aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1562afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 15631b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1564afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1565afb2bd1cSJunchao Zhang #endif 1566afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1567aa372e3fSPaul Mullowney 15684e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 15694e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 15704e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 15714e4bbfaaSStefano Zampini xGPU); 15729ae82921SPaul Mullowney 1573c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1574c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 157505035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1576661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1577958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 15789ae82921SPaul Mullowney PetscFunctionReturn(0); 15799ae82921SPaul Mullowney } 15809ae82921SPaul Mullowney 15816fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 15829ae82921SPaul Mullowney { 1583465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1584465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 15859ae82921SPaul Mullowney cusparseStatus_t stat; 15869ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1587aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1588aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1589aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1590b175d8bbSPaul Mullowney PetscErrorCode ierr; 159157d48284SJunchao Zhang cudaError_t cerr; 15929ae82921SPaul Mullowney 15939ae82921SPaul Mullowney PetscFunctionBegin; 1594e057df02SPaul Mullowney /* Get the GPU pointers */ 1595c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1596c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 15979ae82921SPaul Mullowney 15987a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1599aa372e3fSPaul Mullowney /* First, solve L */ 1600aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1601afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16021b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1603afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1604afb2bd1cSJunchao Zhang #endif 1605afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1606aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1607aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1608aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1609aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1610afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 16111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1612afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1613afb2bd1cSJunchao Zhang #endif 1614afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1615aa372e3fSPaul Mullowney 1616aa372e3fSPaul Mullowney /* Next, solve U */ 1617aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1618afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16191b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1620afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1621afb2bd1cSJunchao Zhang #endif 1622afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1623aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1624aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1625aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1626aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1627afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 16281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1629afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1630afb2bd1cSJunchao Zhang #endif 1631afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 16329ae82921SPaul Mullowney 1633c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1634c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 163505035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1636661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1637958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16389ae82921SPaul Mullowney PetscFunctionReturn(0); 16399ae82921SPaul Mullowney } 16409ae82921SPaul Mullowney 16417e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 16427e8381f9SStefano Zampini { 16437e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16447e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 16457e8381f9SStefano Zampini cudaError_t cerr; 16467e8381f9SStefano Zampini PetscErrorCode ierr; 16477e8381f9SStefano Zampini 16487e8381f9SStefano Zampini PetscFunctionBegin; 16497e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 16507e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 16517e8381f9SStefano Zampini 16527e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16537e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 16547e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 16557e8381f9SStefano Zampini ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 16567e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16577e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 16587e8381f9SStefano Zampini } 16597e8381f9SStefano Zampini PetscFunctionReturn(0); 16607e8381f9SStefano Zampini } 16617e8381f9SStefano Zampini 16627e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 16637e8381f9SStefano Zampini { 16647e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16657e8381f9SStefano Zampini PetscErrorCode ierr; 16667e8381f9SStefano Zampini 16677e8381f9SStefano Zampini PetscFunctionBegin; 16687e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 16697e8381f9SStefano Zampini *array = a->a; 16707e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 16717e8381f9SStefano Zampini PetscFunctionReturn(0); 16727e8381f9SStefano Zampini } 16737e8381f9SStefano Zampini 16746fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 16759ae82921SPaul Mullowney { 1676aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 16777c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 16789ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1679213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 16809ae82921SPaul Mullowney PetscErrorCode ierr; 1681aa372e3fSPaul Mullowney cusparseStatus_t stat; 1682b06137fdSPaul Mullowney cudaError_t err; 16839ae82921SPaul Mullowney 16849ae82921SPaul Mullowney PetscFunctionBegin; 168595639643SRichard Tran Mills if (A->boundtocpu) PetscFunctionReturn(0); 1686c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 168781902715SJunchao Zhang if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 168881902715SJunchao Zhang /* Copy values only */ 1689afb2bd1cSJunchao Zhang CsrMatrix *matrix,*matrixT; 1690afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 169185ba7357SStefano Zampini 169285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1693afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 169405035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 16954863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 169685ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 169781902715SJunchao Zhang 169881902715SJunchao Zhang /* Update matT when it was built before */ 169981902715SJunchao Zhang if (cusparsestruct->matTranspose) { 170081902715SJunchao Zhang cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1701afb2bd1cSJunchao Zhang matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 170285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 170381902715SJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1704afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1705afb2bd1cSJunchao Zhang matrix->values->data().get(), 170681902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1707afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1708afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1709afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1710afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1711afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1712afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1713afb2bd1cSJunchao Zhang #else 1714afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1715afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1716afb2bd1cSJunchao Zhang #endif 1717afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 171805035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 171985ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 172081902715SJunchao Zhang } 172134d6c7a5SJose E. Roman } else { 172285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17237c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 17247c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 17257c700b8dSJunchao Zhang delete cusparsestruct->workVector; 172681902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 17279ae82921SPaul Mullowney try { 17289ae82921SPaul Mullowney if (a->compressedrow.use) { 17299ae82921SPaul Mullowney m = a->compressedrow.nrows; 17309ae82921SPaul Mullowney ii = a->compressedrow.i; 17319ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 17329ae82921SPaul Mullowney } else { 1733213423ffSJunchao Zhang m = A->rmap->n; 1734213423ffSJunchao Zhang ii = a->i; 1735e6e9a74fSStefano Zampini ridx = NULL; 17369ae82921SPaul Mullowney } 1737213423ffSJunchao Zhang cusparsestruct->nrows = m; 17389ae82921SPaul Mullowney 173985ba7357SStefano Zampini /* create cusparse matrix */ 1740aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 174157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 174257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 174357d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 17449ae82921SPaul Mullowney 1745afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 17467656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 17477656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1748afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17497656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17507656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 175157d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1752b06137fdSPaul Mullowney 1753aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1754aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1755aa372e3fSPaul Mullowney /* set the matrix */ 1756afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1757afb2bd1cSJunchao Zhang mat->num_rows = m; 1758afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1759afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1760afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1761afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 17629ae82921SPaul Mullowney 1763afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1764afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1765aa372e3fSPaul Mullowney 1766afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1767afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1768aa372e3fSPaul Mullowney 1769aa372e3fSPaul Mullowney /* assign the pointer */ 1770afb2bd1cSJunchao Zhang matstruct->mat = mat; 1771afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1772afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1773afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1774afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1775afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1776afb2bd1cSJunchao Zhang mat->values->data().get(), 1777afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1778afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1779afb2bd1cSJunchao Zhang } 1780afb2bd1cSJunchao Zhang #endif 1781aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1782afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1783afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1784afb2bd1cSJunchao Zhang #else 1785afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1786afb2bd1cSJunchao Zhang mat->num_rows = m; 1787afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1788afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1789afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1790afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1791aa372e3fSPaul Mullowney 1792afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1793afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1794aa372e3fSPaul Mullowney 1795afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1796afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1797aa372e3fSPaul Mullowney 1798aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 179957d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1800aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1801aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1802afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1803afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1804afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1805afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 180657d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1807aa372e3fSPaul Mullowney /* assign the pointer */ 1808aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1809aa372e3fSPaul Mullowney 1810afb2bd1cSJunchao Zhang if (mat) { 1811afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1812afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1813afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1814afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1815087f3262SPaul Mullowney } 1816afb2bd1cSJunchao Zhang #endif 1817087f3262SPaul Mullowney } 1818ca45077fSPaul Mullowney 1819aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1820213423ffSJunchao Zhang if (a->compressedrow.use) { 1821213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1822aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1823aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1824213423ffSJunchao Zhang tmp = m; 1825213423ffSJunchao Zhang } else { 1826213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1827213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1828213423ffSJunchao Zhang tmp = 0; 1829213423ffSJunchao Zhang } 1830213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1831aa372e3fSPaul Mullowney 1832aa372e3fSPaul Mullowney /* assign the pointer */ 1833aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 18349ae82921SPaul Mullowney } catch(char *ex) { 18359ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18369ae82921SPaul Mullowney } 183705035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 183885ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 183934d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 184034d6c7a5SJose E. Roman } 1841c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 18429ae82921SPaul Mullowney } 18439ae82921SPaul Mullowney PetscFunctionReturn(0); 18449ae82921SPaul Mullowney } 18459ae82921SPaul Mullowney 1846c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1847aa372e3fSPaul Mullowney { 1848aa372e3fSPaul Mullowney template <typename Tuple> 1849aa372e3fSPaul Mullowney __host__ __device__ 1850aa372e3fSPaul Mullowney void operator()(Tuple t) 1851aa372e3fSPaul Mullowney { 1852aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1853aa372e3fSPaul Mullowney } 1854aa372e3fSPaul Mullowney }; 1855aa372e3fSPaul Mullowney 18567e8381f9SStefano Zampini struct VecCUDAEquals 18577e8381f9SStefano Zampini { 18587e8381f9SStefano Zampini template <typename Tuple> 18597e8381f9SStefano Zampini __host__ __device__ 18607e8381f9SStefano Zampini void operator()(Tuple t) 18617e8381f9SStefano Zampini { 18627e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 18637e8381f9SStefano Zampini } 18647e8381f9SStefano Zampini }; 18657e8381f9SStefano Zampini 1866e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1867e6e9a74fSStefano Zampini { 1868e6e9a74fSStefano Zampini template <typename Tuple> 1869e6e9a74fSStefano Zampini __host__ __device__ 1870e6e9a74fSStefano Zampini void operator()(Tuple t) 1871e6e9a74fSStefano Zampini { 1872e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1873e6e9a74fSStefano Zampini } 1874e6e9a74fSStefano Zampini }; 1875e6e9a74fSStefano Zampini 1876afb2bd1cSJunchao Zhang struct MatMatCusparse { 1877ccdfe979SStefano Zampini PetscBool cisdense; 1878ccdfe979SStefano Zampini PetscScalar *Bt; 1879ccdfe979SStefano Zampini Mat X; 1880afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1881afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1882afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1883afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1884afb2bd1cSJunchao Zhang size_t spmmBufferSize; 1885afb2bd1cSJunchao Zhang void *spmmBuffer; 1886afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1887afb2bd1cSJunchao Zhang #endif 1888afb2bd1cSJunchao Zhang }; 1889ccdfe979SStefano Zampini 1890ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1891ccdfe979SStefano Zampini { 1892ccdfe979SStefano Zampini PetscErrorCode ierr; 1893ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1894ccdfe979SStefano Zampini cudaError_t cerr; 1895ccdfe979SStefano Zampini 1896ccdfe979SStefano Zampini PetscFunctionBegin; 1897ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1898afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1899afb2bd1cSJunchao Zhang cusparseStatus_t stat; 1900afb2bd1cSJunchao Zhang if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1901afb2bd1cSJunchao Zhang if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1902afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1903afb2bd1cSJunchao Zhang #endif 1904ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1905ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1906ccdfe979SStefano Zampini PetscFunctionReturn(0); 1907ccdfe979SStefano Zampini } 1908ccdfe979SStefano Zampini 1909ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1910ccdfe979SStefano Zampini 1911ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1912ccdfe979SStefano Zampini { 1913ccdfe979SStefano Zampini Mat_Product *product = C->product; 1914ccdfe979SStefano Zampini Mat A,B; 1915afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1916ccdfe979SStefano Zampini PetscBool flg,biscuda; 1917ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1918ccdfe979SStefano Zampini cusparseStatus_t stat; 1919ccdfe979SStefano Zampini cusparseOperation_t opA; 1920ccdfe979SStefano Zampini const PetscScalar *barray; 1921ccdfe979SStefano Zampini PetscScalar *carray; 1922ccdfe979SStefano Zampini PetscErrorCode ierr; 1923ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1924ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1925ccdfe979SStefano Zampini CsrMatrix *csrmat; 1926afb2bd1cSJunchao Zhang cudaError_t cerr; 1927ccdfe979SStefano Zampini 1928ccdfe979SStefano Zampini PetscFunctionBegin; 1929ccdfe979SStefano Zampini MatCheckProduct(C,1); 1930ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1931ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1932ccdfe979SStefano Zampini A = product->A; 1933ccdfe979SStefano Zampini B = product->B; 1934ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1935ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1936ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1937ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1938ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1939ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1940ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1941ccdfe979SStefano Zampini switch (product->type) { 1942ccdfe979SStefano Zampini case MATPRODUCT_AB: 1943ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1944ccdfe979SStefano Zampini mat = cusp->mat; 1945ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1946ccdfe979SStefano Zampini m = A->rmap->n; 1947ccdfe979SStefano Zampini n = B->cmap->n; 1948ccdfe979SStefano Zampini break; 1949ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1950e6e9a74fSStefano Zampini if (!cusp->transgen) { 1951e6e9a74fSStefano Zampini mat = cusp->mat; 1952e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1953e6e9a74fSStefano Zampini } else { 1954ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1955ccdfe979SStefano Zampini mat = cusp->matTranspose; 1956ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1957e6e9a74fSStefano Zampini } 1958ccdfe979SStefano Zampini m = A->cmap->n; 1959ccdfe979SStefano Zampini n = B->cmap->n; 1960ccdfe979SStefano Zampini break; 1961ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1962ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1963ccdfe979SStefano Zampini mat = cusp->mat; 1964ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1965ccdfe979SStefano Zampini m = A->rmap->n; 1966ccdfe979SStefano Zampini n = B->rmap->n; 1967ccdfe979SStefano Zampini break; 1968ccdfe979SStefano Zampini default: 1969ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1970ccdfe979SStefano Zampini } 1971ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1972ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 1973ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 1974ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1975afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1976ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1977afb2bd1cSJunchao Zhang 1978ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1979c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1980c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1981c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1982c8378d12SStefano Zampini } else { 1983c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1984c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1985c8378d12SStefano Zampini } 1986c8378d12SStefano Zampini 1987c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1988afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1989afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1990afb2bd1cSJunchao Zhang /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1991afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1992afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1993afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 1994afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1995afb2bd1cSJunchao Zhang mmdata->Blda = blda; 1996afb2bd1cSJunchao Zhang } 1997c8378d12SStefano Zampini 1998afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1999afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2000afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2001afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2002afb2bd1cSJunchao Zhang } 2003afb2bd1cSJunchao Zhang 2004afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2005afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2006afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2007afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2008afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2009afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2010afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2011afb2bd1cSJunchao Zhang } 2012afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2013afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2014afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2015afb2bd1cSJunchao Zhang cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 2016afb2bd1cSJunchao Zhang if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 2017afb2bd1cSJunchao Zhang cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 2018afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2019afb2bd1cSJunchao Zhang } else { 2020afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2021afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2022afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2023afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2024afb2bd1cSJunchao Zhang } 2025afb2bd1cSJunchao Zhang 2026afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2027afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2028afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2029afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2030afb2bd1cSJunchao Zhang cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 2031afb2bd1cSJunchao Zhang #else 2032afb2bd1cSJunchao Zhang PetscInt k; 2033afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2034ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2035ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2036ccdfe979SStefano Zampini cublasStatus_t cerr; 2037ccdfe979SStefano Zampini 2038ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2039ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2040ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2041ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2042ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2043ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2044ccdfe979SStefano Zampini blda = B->cmap->n; 2045afb2bd1cSJunchao Zhang k = B->cmap->n; 2046afb2bd1cSJunchao Zhang } else { 2047afb2bd1cSJunchao Zhang k = B->rmap->n; 2048ccdfe979SStefano Zampini } 2049ccdfe979SStefano Zampini 2050afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2051ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2052afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2053ccdfe979SStefano Zampini csrmat->values->data().get(), 2054ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2055ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2056ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2057ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2058afb2bd1cSJunchao Zhang #endif 2059afb2bd1cSJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2060c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2061c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2062ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2063ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2064ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2065ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2066ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2067ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2068ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2069ccdfe979SStefano Zampini } else { 2070ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2071ccdfe979SStefano Zampini } 2072ccdfe979SStefano Zampini if (mmdata->cisdense) { 2073ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2074ccdfe979SStefano Zampini } 2075ccdfe979SStefano Zampini if (!biscuda) { 2076ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2077ccdfe979SStefano Zampini } 2078ccdfe979SStefano Zampini PetscFunctionReturn(0); 2079ccdfe979SStefano Zampini } 2080ccdfe979SStefano Zampini 2081ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2082ccdfe979SStefano Zampini { 2083ccdfe979SStefano Zampini Mat_Product *product = C->product; 2084ccdfe979SStefano Zampini Mat A,B; 2085ccdfe979SStefano Zampini PetscInt m,n; 2086ccdfe979SStefano Zampini PetscBool cisdense,flg; 2087ccdfe979SStefano Zampini PetscErrorCode ierr; 2088ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2089ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2090ccdfe979SStefano Zampini 2091ccdfe979SStefano Zampini PetscFunctionBegin; 2092ccdfe979SStefano Zampini MatCheckProduct(C,1); 2093ccdfe979SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2094ccdfe979SStefano Zampini A = product->A; 2095ccdfe979SStefano Zampini B = product->B; 2096ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2097ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2098ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2099ccdfe979SStefano Zampini if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2100ccdfe979SStefano Zampini switch (product->type) { 2101ccdfe979SStefano Zampini case MATPRODUCT_AB: 2102ccdfe979SStefano Zampini m = A->rmap->n; 2103ccdfe979SStefano Zampini n = B->cmap->n; 2104ccdfe979SStefano Zampini break; 2105ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2106ccdfe979SStefano Zampini m = A->cmap->n; 2107ccdfe979SStefano Zampini n = B->cmap->n; 2108ccdfe979SStefano Zampini break; 2109ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2110ccdfe979SStefano Zampini m = A->rmap->n; 2111ccdfe979SStefano Zampini n = B->rmap->n; 2112ccdfe979SStefano Zampini break; 2113ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2114ccdfe979SStefano Zampini m = B->cmap->n; 2115ccdfe979SStefano Zampini n = B->cmap->n; 2116ccdfe979SStefano Zampini break; 2117ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2118ccdfe979SStefano Zampini m = B->rmap->n; 2119ccdfe979SStefano Zampini n = B->rmap->n; 2120ccdfe979SStefano Zampini break; 2121ccdfe979SStefano Zampini default: 2122ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2123ccdfe979SStefano Zampini } 2124ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2125ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2126ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2127ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2128ccdfe979SStefano Zampini 2129ccdfe979SStefano Zampini /* product data */ 2130ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2131ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2132afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2133afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2134ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2135afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2136ccdfe979SStefano Zampini } 2137afb2bd1cSJunchao Zhang #endif 2138ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2139ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2140ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2141ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2142ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2143ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2144ccdfe979SStefano Zampini } else { 2145ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2146ccdfe979SStefano Zampini } 2147ccdfe979SStefano Zampini } 2148ccdfe979SStefano Zampini C->product->data = mmdata; 2149ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2150ccdfe979SStefano Zampini 2151ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2152ccdfe979SStefano Zampini PetscFunctionReturn(0); 2153ccdfe979SStefano Zampini } 2154ccdfe979SStefano Zampini 2155ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2156ccdfe979SStefano Zampini 2157ccdfe979SStefano Zampini /* handles dense B */ 2158ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2159ccdfe979SStefano Zampini { 2160ccdfe979SStefano Zampini Mat_Product *product = C->product; 2161ccdfe979SStefano Zampini PetscErrorCode ierr; 2162ccdfe979SStefano Zampini 2163ccdfe979SStefano Zampini PetscFunctionBegin; 2164ccdfe979SStefano Zampini MatCheckProduct(C,1); 2165ccdfe979SStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2166ccdfe979SStefano Zampini if (product->A->boundtocpu) { 2167ccdfe979SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2168ccdfe979SStefano Zampini PetscFunctionReturn(0); 2169ccdfe979SStefano Zampini } 2170ccdfe979SStefano Zampini switch (product->type) { 2171ccdfe979SStefano Zampini case MATPRODUCT_AB: 2172ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2173ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2174ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2175ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2176ccdfe979SStefano Zampini C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2177ccdfe979SStefano Zampini default: 2178ccdfe979SStefano Zampini break; 2179ccdfe979SStefano Zampini } 2180ccdfe979SStefano Zampini PetscFunctionReturn(0); 2181ccdfe979SStefano Zampini } 2182ccdfe979SStefano Zampini 21836fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 21849ae82921SPaul Mullowney { 2185b175d8bbSPaul Mullowney PetscErrorCode ierr; 21869ae82921SPaul Mullowney 21879ae82921SPaul Mullowney PetscFunctionBegin; 2188e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2189e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2190e6e9a74fSStefano Zampini } 2191e6e9a74fSStefano Zampini 2192e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2193e6e9a74fSStefano Zampini { 2194e6e9a74fSStefano Zampini PetscErrorCode ierr; 2195e6e9a74fSStefano Zampini 2196e6e9a74fSStefano Zampini PetscFunctionBegin; 2197e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2198e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2199e6e9a74fSStefano Zampini } 2200e6e9a74fSStefano Zampini 2201e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2202e6e9a74fSStefano Zampini { 2203e6e9a74fSStefano Zampini PetscErrorCode ierr; 2204e6e9a74fSStefano Zampini 2205e6e9a74fSStefano Zampini PetscFunctionBegin; 2206e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2207e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2208e6e9a74fSStefano Zampini } 2209e6e9a74fSStefano Zampini 2210e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2211e6e9a74fSStefano Zampini { 2212e6e9a74fSStefano Zampini PetscErrorCode ierr; 2213e6e9a74fSStefano Zampini 2214e6e9a74fSStefano Zampini PetscFunctionBegin; 2215e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 22169ae82921SPaul Mullowney PetscFunctionReturn(0); 22179ae82921SPaul Mullowney } 22189ae82921SPaul Mullowney 22196fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2220ca45077fSPaul Mullowney { 2221b175d8bbSPaul Mullowney PetscErrorCode ierr; 2222ca45077fSPaul Mullowney 2223ca45077fSPaul Mullowney PetscFunctionBegin; 2224e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2225ca45077fSPaul Mullowney PetscFunctionReturn(0); 2226ca45077fSPaul Mullowney } 2227ca45077fSPaul Mullowney 2228afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2229e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 22309ae82921SPaul Mullowney { 22319ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2232aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 22339ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2234e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2235b175d8bbSPaul Mullowney PetscErrorCode ierr; 223657d48284SJunchao Zhang cudaError_t cerr; 2237aa372e3fSPaul Mullowney cusparseStatus_t stat; 2238e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2239e6e9a74fSStefano Zampini PetscBool compressed; 2240afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2241afb2bd1cSJunchao Zhang PetscInt nx,ny; 2242afb2bd1cSJunchao Zhang #endif 22436e111a19SKarl Rupp 22449ae82921SPaul Mullowney PetscFunctionBegin; 2245e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2246e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2247afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2248d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2249e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2250e6e9a74fSStefano Zampini } 225134d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 225234d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2253e6e9a74fSStefano Zampini if (!trans) { 22549ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2255c9567895SMark if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2256e6e9a74fSStefano Zampini } else { 2257e6e9a74fSStefano Zampini if (herm || !cusparsestruct->transgen) { 2258e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2259e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2260e6e9a74fSStefano Zampini } else { 2261afb2bd1cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2262e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2263e6e9a74fSStefano Zampini } 2264e6e9a74fSStefano Zampini } 2265e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2266e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2267213423ffSJunchao Zhang 2268e6e9a74fSStefano Zampini try { 2269e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2270213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2271213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2272afb2bd1cSJunchao Zhang 227385ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2274e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2275afb2bd1cSJunchao Zhang /* z = A x + beta y. 2276afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2277afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2278afb2bd1cSJunchao Zhang */ 2279e6e9a74fSStefano Zampini xptr = xarray; 2280afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2281213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2282afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2283afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2284afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2285afb2bd1cSJunchao Zhang */ 2286afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2287afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2288afb2bd1cSJunchao Zhang nx = mat->num_cols; 2289afb2bd1cSJunchao Zhang ny = mat->num_rows; 2290afb2bd1cSJunchao Zhang } 2291afb2bd1cSJunchao Zhang #endif 2292e6e9a74fSStefano Zampini } else { 2293afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2294afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2295afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2296afb2bd1cSJunchao Zhang */ 2297afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2298e6e9a74fSStefano Zampini dptr = zarray; 2299e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2300afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2301e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2302e6e9a74fSStefano Zampini thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2303e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2304e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2305e6e9a74fSStefano Zampini } 2306afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2307afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2308afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2309afb2bd1cSJunchao Zhang nx = mat->num_rows; 2310afb2bd1cSJunchao Zhang ny = mat->num_cols; 2311afb2bd1cSJunchao Zhang } 2312afb2bd1cSJunchao Zhang #endif 2313e6e9a74fSStefano Zampini } 23149ae82921SPaul Mullowney 2315afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2316aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2317afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2318afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2319afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2320afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2321afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2322afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2323afb2bd1cSJunchao Zhang matstruct->matDescr, 2324afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2325afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2326afb2bd1cSJunchao Zhang cusparse_scalartype, 2327afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2328afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2329afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2330afb2bd1cSJunchao Zhang 2331afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2332afb2bd1cSJunchao Zhang } else { 2333afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2334afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2335afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2336afb2bd1cSJunchao Zhang } 2337afb2bd1cSJunchao Zhang 2338afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2339afb2bd1cSJunchao Zhang matstruct->alpha_one, 2340afb2bd1cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2341afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2342afb2bd1cSJunchao Zhang beta, 2343afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2344afb2bd1cSJunchao Zhang cusparse_scalartype, 2345afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2346afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2347afb2bd1cSJunchao Zhang #else 23487656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2349e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2350a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 2351afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2352aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2353e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 235457d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2355afb2bd1cSJunchao Zhang #endif 2356aa372e3fSPaul Mullowney } else { 2357213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2358afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2359afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2360afb2bd1cSJunchao Zhang #else 2361301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2362e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2363afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2364e6e9a74fSStefano Zampini xptr, beta, 236557d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2366afb2bd1cSJunchao Zhang #endif 2367a65300a6SPaul Mullowney } 2368aa372e3fSPaul Mullowney } 236905035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2370958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2371aa372e3fSPaul Mullowney 2372e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2373213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2374213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2375213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2376e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2377213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 23787656d835SStefano Zampini } 2379213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2380c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 23817656d835SStefano Zampini } 23827656d835SStefano Zampini 2383213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2384213423ffSJunchao Zhang if (compressed) { 2385213423ffSJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2386e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2387c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2388e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2389c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 239005035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2391958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2392e6e9a74fSStefano Zampini } 2393e6e9a74fSStefano Zampini } else { 2394e6e9a74fSStefano Zampini if (yy && yy != zz) { 2395e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2396e6e9a74fSStefano Zampini } 2397e6e9a74fSStefano Zampini } 2398e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2399213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2400213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 24019ae82921SPaul Mullowney } catch(char *ex) { 24029ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 24039ae82921SPaul Mullowney } 2404e6e9a74fSStefano Zampini if (yy) { 2405958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2406e6e9a74fSStefano Zampini } else { 2407e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2408e6e9a74fSStefano Zampini } 24099ae82921SPaul Mullowney PetscFunctionReturn(0); 24109ae82921SPaul Mullowney } 24119ae82921SPaul Mullowney 24126fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2413ca45077fSPaul Mullowney { 2414b175d8bbSPaul Mullowney PetscErrorCode ierr; 24156e111a19SKarl Rupp 2416ca45077fSPaul Mullowney PetscFunctionBegin; 2417e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2418ca45077fSPaul Mullowney PetscFunctionReturn(0); 2419ca45077fSPaul Mullowney } 2420ca45077fSPaul Mullowney 24216fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 24229ae82921SPaul Mullowney { 24239ae82921SPaul Mullowney PetscErrorCode ierr; 2424a587d139SMark PetscSplitCSRDataStructure *d_mat = NULL; 24259ae82921SPaul Mullowney PetscFunctionBegin; 2426bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 24273fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2428bc3f50f2SPaul Mullowney } 24293fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 24303fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2431a587d139SMark if (d_mat) { 24323fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 24333fa6b06aSMark Adams } 24343fa6b06aSMark Adams 24359ae82921SPaul Mullowney PetscFunctionReturn(0); 24369ae82921SPaul Mullowney } 24379ae82921SPaul Mullowney 24389ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2439e057df02SPaul Mullowney /*@ 24409ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2441e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2442e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2443e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2444e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2445e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 24469ae82921SPaul Mullowney 2447d083f849SBarry Smith Collective 24489ae82921SPaul Mullowney 24499ae82921SPaul Mullowney Input Parameters: 24509ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 24519ae82921SPaul Mullowney . m - number of rows 24529ae82921SPaul Mullowney . n - number of columns 24539ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 24549ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 24550298fd71SBarry Smith (possibly different for each row) or NULL 24569ae82921SPaul Mullowney 24579ae82921SPaul Mullowney Output Parameter: 24589ae82921SPaul Mullowney . A - the matrix 24599ae82921SPaul Mullowney 24609ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 24619ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 24629ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 24639ae82921SPaul Mullowney 24649ae82921SPaul Mullowney Notes: 24659ae82921SPaul Mullowney If nnz is given then nz is ignored 24669ae82921SPaul Mullowney 24679ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 24689ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 24699ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 24709ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 24719ae82921SPaul Mullowney 24729ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 24730298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 24749ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 24759ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 24769ae82921SPaul Mullowney 24779ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 24789ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 24799ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 24809ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 24819ae82921SPaul Mullowney 24829ae82921SPaul Mullowney Level: intermediate 24839ae82921SPaul Mullowney 2484e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 24859ae82921SPaul Mullowney @*/ 24869ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 24879ae82921SPaul Mullowney { 24889ae82921SPaul Mullowney PetscErrorCode ierr; 24899ae82921SPaul Mullowney 24909ae82921SPaul Mullowney PetscFunctionBegin; 24919ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 24929ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 24939ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 24949ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 24959ae82921SPaul Mullowney PetscFunctionReturn(0); 24969ae82921SPaul Mullowney } 24979ae82921SPaul Mullowney 24986fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 24999ae82921SPaul Mullowney { 25009ae82921SPaul Mullowney PetscErrorCode ierr; 25013fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 2502ab25e6cbSDominic Meiser 25039ae82921SPaul Mullowney PetscFunctionBegin; 25049ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 25053fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 25063fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2507470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 25089ae82921SPaul Mullowney } else { 2509470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2510aa372e3fSPaul Mullowney } 25113fa6b06aSMark Adams if (d_mat) { 25123fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 25133fa6b06aSMark Adams cudaError_t err; 25143fa6b06aSMark Adams PetscSplitCSRDataStructure h_mat; 25153fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 25163fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 25173fa6b06aSMark Adams if (a->compressedrow.use) { 25183fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 25193fa6b06aSMark Adams } 25203fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 25213fa6b06aSMark Adams } 2522ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2523ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2524ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2525ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 25267e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 25277e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 25289ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 25299ae82921SPaul Mullowney PetscFunctionReturn(0); 25309ae82921SPaul Mullowney } 25319ae82921SPaul Mullowney 2532ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 253395639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 25349ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 25359ff858a8SKarl Rupp { 25369ff858a8SKarl Rupp PetscErrorCode ierr; 25379ff858a8SKarl Rupp 25389ff858a8SKarl Rupp PetscFunctionBegin; 25399ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2540ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 25419ff858a8SKarl Rupp PetscFunctionReturn(0); 25429ff858a8SKarl Rupp } 25439ff858a8SKarl Rupp 2544a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc. 254595639643SRichard Tran Mills { 2546e6e9a74fSStefano Zampini PetscErrorCode ierr; 2547a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 2548a587d139SMark PetscBool flgx,flgy; 2549e6e9a74fSStefano Zampini 255095639643SRichard Tran Mills PetscFunctionBegin; 2551a587d139SMark if (a == (PetscScalar)0.0) PetscFunctionReturn(0); 2552a587d139SMark PetscValidHeaderSpecific(Y,MAT_CLASSID,1); 2553a587d139SMark PetscValidHeaderSpecific(X,MAT_CLASSID,3); 2554a587d139SMark ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr); 2555a587d139SMark ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr); 2556a587d139SMark if (!flgx || !flgy) { 2557a587d139SMark ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 2558a587d139SMark PetscFunctionReturn(0); 255995639643SRichard Tran Mills } 2560a587d139SMark if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE"); 2561a587d139SMark if (str == DIFFERENT_NONZERO_PATTERN) { 2562a587d139SMark if (x->nz == y->nz) { 2563a587d139SMark PetscBool e; 2564a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 2565a587d139SMark if (e) { 2566a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 2567a587d139SMark if (e) { 2568a587d139SMark str = SAME_NONZERO_PATTERN; 2569a587d139SMark } 2570a587d139SMark } 2571a587d139SMark } 2572a587d139SMark } 2573a587d139SMark if (str != SAME_NONZERO_PATTERN) { 2574a587d139SMark ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 2575a587d139SMark PetscFunctionReturn(0); 2576a587d139SMark } else { 2577a587d139SMark Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr; 2578a587d139SMark Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr; 2579a587d139SMark if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 2580a587d139SMark if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 2581a587d139SMark if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) { 2582a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) { 2583a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 2584a587d139SMark } 2585a587d139SMark if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) { 2586a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr); 2587a587d139SMark } 2588a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2589a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 2590a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 2591a587d139SMark } else { 2592a587d139SMark cublasHandle_t cublasv2handle; 2593a587d139SMark cublasStatus_t cberr; 2594a587d139SMark cudaError_t err; 2595a587d139SMark PetscScalar alpha = a; 2596a587d139SMark PetscBLASInt one = 1, bnz = 1; 2597a587d139SMark CsrMatrix *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat; 2598a587d139SMark CsrMatrix *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat; 2599a587d139SMark PetscScalar *aa_y, *aa_x; 2600a587d139SMark aa_y = matrix_y->values->data().get(); 2601a587d139SMark aa_x = matrix_x->values->data().get(); 2602a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2603a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 2604a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2605a587d139SMark cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr); 2606a587d139SMark err = WaitForCUDA();CHKERRCUDA(err); 2607a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 2608a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2609a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 2610a587d139SMark ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr); 2611a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU; 2612a587d139SMark else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state"); 2613a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 2614a587d139SMark } 2615a587d139SMark } 261695639643SRichard Tran Mills PetscFunctionReturn(0); 261795639643SRichard Tran Mills } 261895639643SRichard Tran Mills 26193fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 26203fa6b06aSMark Adams { 26213fa6b06aSMark Adams PetscErrorCode ierr; 26227e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 2623a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 26247e8381f9SStefano Zampini 26253fa6b06aSMark Adams PetscFunctionBegin; 26263fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 26273fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 26287e8381f9SStefano Zampini if (spptr->mat) { 26297e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 26307e8381f9SStefano Zampini if (matrix->values) { 26317e8381f9SStefano Zampini both = PETSC_TRUE; 26327e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 26337e8381f9SStefano Zampini } 26347e8381f9SStefano Zampini } 26357e8381f9SStefano Zampini if (spptr->matTranspose) { 26367e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 26377e8381f9SStefano Zampini if (matrix->values) { 26387e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 26397e8381f9SStefano Zampini } 26407e8381f9SStefano Zampini } 26413fa6b06aSMark Adams } 2642a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 2643a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 2644a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 26457e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2646a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 26473fa6b06aSMark Adams 26483fa6b06aSMark Adams PetscFunctionReturn(0); 26493fa6b06aSMark Adams } 26503fa6b06aSMark Adams 2651a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2652a587d139SMark { 2653a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2654a587d139SMark PetscErrorCode ierr; 2655a587d139SMark 2656a587d139SMark PetscFunctionBegin; 2657a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2658a587d139SMark if (flg) { 2659a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2660a587d139SMark 2661a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 2662a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 2663a587d139SMark A->ops->mult = MatMult_SeqAIJ; 2664a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 2665a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2666a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2667a587d139SMark A->ops->multhermitiantranspose = NULL; 2668a587d139SMark A->ops->multhermitiantransposeadd = NULL; 2669a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2670a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2671a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2672a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2673a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 2674a587d139SMark } else { 2675a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 2676a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 2677a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 2678a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 2679a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 2680a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2681a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2682a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2683a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2684a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2685a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2686a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2687a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 2688a587d139SMark } 2689a587d139SMark A->boundtocpu = flg; 2690a587d139SMark a->inode.use = flg; 2691a587d139SMark PetscFunctionReturn(0); 2692a587d139SMark } 2693a587d139SMark 269449735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 26959ae82921SPaul Mullowney { 26969ae82921SPaul Mullowney PetscErrorCode ierr; 2697aa372e3fSPaul Mullowney cusparseStatus_t stat; 269849735bf3SStefano Zampini Mat B; 26999ae82921SPaul Mullowney 27009ae82921SPaul Mullowney PetscFunctionBegin; 2701*832b2c02SStefano Zampini ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 270249735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 270349735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 270449735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 270549735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 270649735bf3SStefano Zampini } 270749735bf3SStefano Zampini B = *newmat; 270849735bf3SStefano Zampini 270934136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 271034136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 271134136279SStefano Zampini 271249735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 27139ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 2714e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 2715e6e9a74fSStefano Zampini 2716e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2717e6e9a74fSStefano Zampini spptr->format = MAT_CUSPARSE_CSR; 2718e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2719e6e9a74fSStefano Zampini B->spptr = spptr; 27203fa6b06aSMark Adams spptr->deviceMat = NULL; 27219ae82921SPaul Mullowney } else { 2722e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 2723e6e9a74fSStefano Zampini 2724e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2725e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2726e6e9a74fSStefano Zampini B->spptr = spptr; 27279ae82921SPaul Mullowney } 2728e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 272949735bf3SStefano Zampini } 2730693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 27319ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 27329ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 273395639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2734693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 27352205254eSKarl Rupp 2736e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 27379ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2738bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 27399ae82921SPaul Mullowney PetscFunctionReturn(0); 27409ae82921SPaul Mullowney } 27419ae82921SPaul Mullowney 274202fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 274302fe1965SBarry Smith { 274402fe1965SBarry Smith PetscErrorCode ierr; 274502fe1965SBarry Smith 274602fe1965SBarry Smith PetscFunctionBegin; 274702fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 27480ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2749afb2bd1cSJunchao Zhang ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 2750afb2bd1cSJunchao Zhang ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 2751afb2bd1cSJunchao Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 275202fe1965SBarry Smith PetscFunctionReturn(0); 275302fe1965SBarry Smith } 275402fe1965SBarry Smith 27553ca39a21SBarry Smith /*MC 2756e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2757e057df02SPaul Mullowney 2758e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 27592692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 27602692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2761e057df02SPaul Mullowney 2762e057df02SPaul Mullowney Options Database Keys: 2763e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2764aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2765a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2766e057df02SPaul Mullowney 2767e057df02SPaul Mullowney Level: beginner 2768e057df02SPaul Mullowney 27698468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2770e057df02SPaul Mullowney M*/ 27717f756511SDominic Meiser 277242c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 277342c9c57cSBarry Smith 27740f39cd5aSBarry Smith 27753ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 277642c9c57cSBarry Smith { 277742c9c57cSBarry Smith PetscErrorCode ierr; 277842c9c57cSBarry Smith 277942c9c57cSBarry Smith PetscFunctionBegin; 27803ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 27813ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 27823ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 27833ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 278442c9c57cSBarry Smith PetscFunctionReturn(0); 278542c9c57cSBarry Smith } 278629b38603SBarry Smith 2787470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 27887f756511SDominic Meiser { 2789e6e9a74fSStefano Zampini PetscErrorCode ierr; 27907f756511SDominic Meiser cusparseStatus_t stat; 27917f756511SDominic Meiser 27927f756511SDominic Meiser PetscFunctionBegin; 27937f756511SDominic Meiser if (*cusparsestruct) { 2794e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2795e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 27967f756511SDominic Meiser delete (*cusparsestruct)->workVector; 279781902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 27987e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 27997e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 28007e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_v; 28017e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_w; 28027e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 2803afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2804afb2bd1cSJunchao Zhang cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2805afb2bd1cSJunchao Zhang #endif 2806e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 28077f756511SDominic Meiser } 28087f756511SDominic Meiser PetscFunctionReturn(0); 28097f756511SDominic Meiser } 28107f756511SDominic Meiser 28117f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 28127f756511SDominic Meiser { 28137f756511SDominic Meiser PetscFunctionBegin; 28147f756511SDominic Meiser if (*mat) { 28157f756511SDominic Meiser delete (*mat)->values; 28167f756511SDominic Meiser delete (*mat)->column_indices; 28177f756511SDominic Meiser delete (*mat)->row_offsets; 28187f756511SDominic Meiser delete *mat; 28197f756511SDominic Meiser *mat = 0; 28207f756511SDominic Meiser } 28217f756511SDominic Meiser PetscFunctionReturn(0); 28227f756511SDominic Meiser } 28237f756511SDominic Meiser 2824470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 28257f756511SDominic Meiser { 28267f756511SDominic Meiser cusparseStatus_t stat; 28277f756511SDominic Meiser PetscErrorCode ierr; 28287f756511SDominic Meiser 28297f756511SDominic Meiser PetscFunctionBegin; 28307f756511SDominic Meiser if (*trifactor) { 283157d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2832afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 28337f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 28341b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 28352cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 2836afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 28371b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2838afb2bd1cSJunchao Zhang #endif 2839da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 28407f756511SDominic Meiser } 28417f756511SDominic Meiser PetscFunctionReturn(0); 28427f756511SDominic Meiser } 28437f756511SDominic Meiser 2844470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 28457f756511SDominic Meiser { 28467f756511SDominic Meiser CsrMatrix *mat; 28477f756511SDominic Meiser cusparseStatus_t stat; 28487f756511SDominic Meiser cudaError_t err; 28497f756511SDominic Meiser 28507f756511SDominic Meiser PetscFunctionBegin; 28517f756511SDominic Meiser if (*matstruct) { 28527f756511SDominic Meiser if ((*matstruct)->mat) { 28537f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2854afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2855afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2856afb2bd1cSJunchao Zhang #else 28577f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 285857d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2859afb2bd1cSJunchao Zhang #endif 28607f756511SDominic Meiser } else { 28617f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 28627f756511SDominic Meiser CsrMatrix_Destroy(&mat); 28637f756511SDominic Meiser } 28647f756511SDominic Meiser } 286557d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 28667f756511SDominic Meiser delete (*matstruct)->cprowIndices; 2867afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 28687656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 28697656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2870afb2bd1cSJunchao Zhang 2871afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2872afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2873afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2874afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 2875afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 2876afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2877afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2878afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2879afb2bd1cSJunchao Zhang } 2880afb2bd1cSJunchao Zhang } 2881afb2bd1cSJunchao Zhang #endif 28827f756511SDominic Meiser delete *matstruct; 28837e8381f9SStefano Zampini *matstruct = NULL; 28847f756511SDominic Meiser } 28857f756511SDominic Meiser PetscFunctionReturn(0); 28867f756511SDominic Meiser } 28877f756511SDominic Meiser 2888ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 28897f756511SDominic Meiser { 2890e6e9a74fSStefano Zampini PetscErrorCode ierr; 2891e6e9a74fSStefano Zampini 28927f756511SDominic Meiser PetscFunctionBegin; 28937f756511SDominic Meiser if (*trifactors) { 2894e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2895e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2896e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2897e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 28987f756511SDominic Meiser delete (*trifactors)->rpermIndices; 28997f756511SDominic Meiser delete (*trifactors)->cpermIndices; 29007f756511SDominic Meiser delete (*trifactors)->workVector; 29017e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 29027e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 29037e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 2904ccdfe979SStefano Zampini } 2905ccdfe979SStefano Zampini PetscFunctionReturn(0); 2906ccdfe979SStefano Zampini } 2907ccdfe979SStefano Zampini 2908ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2909ccdfe979SStefano Zampini { 2910e6e9a74fSStefano Zampini PetscErrorCode ierr; 2911ccdfe979SStefano Zampini cusparseHandle_t handle; 2912ccdfe979SStefano Zampini cusparseStatus_t stat; 2913ccdfe979SStefano Zampini 2914ccdfe979SStefano Zampini PetscFunctionBegin; 2915ccdfe979SStefano Zampini if (*trifactors) { 2916e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 29177f756511SDominic Meiser if (handle = (*trifactors)->handle) { 291857d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 29197f756511SDominic Meiser } 2920e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 29217f756511SDominic Meiser } 29227f756511SDominic Meiser PetscFunctionReturn(0); 29237f756511SDominic Meiser } 29247e8381f9SStefano Zampini 29257e8381f9SStefano Zampini struct IJCompare 29267e8381f9SStefano Zampini { 29277e8381f9SStefano Zampini __host__ __device__ 29287e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 29297e8381f9SStefano Zampini { 29307e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 29317e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 29327e8381f9SStefano Zampini return false; 29337e8381f9SStefano Zampini } 29347e8381f9SStefano Zampini }; 29357e8381f9SStefano Zampini 29367e8381f9SStefano Zampini struct IJEqual 29377e8381f9SStefano Zampini { 29387e8381f9SStefano Zampini __host__ __device__ 29397e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 29407e8381f9SStefano Zampini { 29417e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 29427e8381f9SStefano Zampini return true; 29437e8381f9SStefano Zampini } 29447e8381f9SStefano Zampini }; 29457e8381f9SStefano Zampini 29467e8381f9SStefano Zampini struct IJDiff 29477e8381f9SStefano Zampini { 29487e8381f9SStefano Zampini __host__ __device__ 29497e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 29507e8381f9SStefano Zampini { 29517e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 29527e8381f9SStefano Zampini } 29537e8381f9SStefano Zampini }; 29547e8381f9SStefano Zampini 29557e8381f9SStefano Zampini struct IJSum 29567e8381f9SStefano Zampini { 29577e8381f9SStefano Zampini __host__ __device__ 29587e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 29597e8381f9SStefano Zampini { 29607e8381f9SStefano Zampini return t1||t2; 29617e8381f9SStefano Zampini } 29627e8381f9SStefano Zampini }; 29637e8381f9SStefano Zampini 29647e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 29657e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 29667e8381f9SStefano Zampini { 29677e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 29687e8381f9SStefano Zampini CsrMatrix *matrix; 29697e8381f9SStefano Zampini PetscErrorCode ierr; 29707e8381f9SStefano Zampini cudaError_t cerr; 29717e8381f9SStefano Zampini PetscInt n; 29727e8381f9SStefano Zampini 29737e8381f9SStefano Zampini PetscFunctionBegin; 29747e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 29757e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 29767e8381f9SStefano Zampini if (!cusp->cooPerm) { 29777e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 29787e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 29797e8381f9SStefano Zampini PetscFunctionReturn(0); 29807e8381f9SStefano Zampini } 29817e8381f9SStefano Zampini n = cusp->cooPerm->size(); 29827e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 29837e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 29847e8381f9SStefano Zampini if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 29857e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 29867e8381f9SStefano Zampini if (v) { 29877e8381f9SStefano Zampini cusp->cooPerm_v->assign(v,v+n); 29887e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 29897e8381f9SStefano Zampini } 29907e8381f9SStefano Zampini else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 29917e8381f9SStefano Zampini if (imode == ADD_VALUES) { 29927e8381f9SStefano Zampini if (cusp->cooPerm_a) { 29937e8381f9SStefano Zampini if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 29947e8381f9SStefano Zampini auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 29957e8381f9SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 29967e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 29977e8381f9SStefano Zampini } else { 29987e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 29997e8381f9SStefano Zampini matrix->values->begin())); 30007e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 30017e8381f9SStefano Zampini matrix->values->end())); 30027e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 30037e8381f9SStefano Zampini } 30047e8381f9SStefano Zampini } else { 30057e8381f9SStefano Zampini if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence) 30067e8381f9SStefano Zampini if we are inserting two different values into the same location */ 30077e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 30087e8381f9SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin()))); 30097e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 30107e8381f9SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end()))); 30117e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 30127e8381f9SStefano Zampini } else { 30137e8381f9SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 30147e8381f9SStefano Zampini matrix->values->begin())); 30157e8381f9SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 30167e8381f9SStefano Zampini matrix->values->end())); 30177e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 30187e8381f9SStefano Zampini } 30197e8381f9SStefano Zampini } 30207e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 30217e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 30227e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 30237e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 30247e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 30257e8381f9SStefano Zampini /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 30267e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 30277e8381f9SStefano Zampini PetscFunctionReturn(0); 30287e8381f9SStefano Zampini } 30297e8381f9SStefano Zampini 30307e8381f9SStefano Zampini #include <thrust/binary_search.h> 30317e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 30327e8381f9SStefano Zampini { 30337e8381f9SStefano Zampini PetscErrorCode ierr; 30347e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 30357e8381f9SStefano Zampini CsrMatrix *matrix; 30367e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30377e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 30387e8381f9SStefano Zampini cudaError_t cerr; 30397e8381f9SStefano Zampini 30407e8381f9SStefano Zampini PetscFunctionBegin; 30417e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 30427e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 30437e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 30447e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 30457e8381f9SStefano Zampini if (n != cooPerm_n) { 30467e8381f9SStefano Zampini delete cusp->cooPerm; 30477e8381f9SStefano Zampini delete cusp->cooPerm_v; 30487e8381f9SStefano Zampini delete cusp->cooPerm_w; 30497e8381f9SStefano Zampini delete cusp->cooPerm_a; 30507e8381f9SStefano Zampini cusp->cooPerm = NULL; 30517e8381f9SStefano Zampini cusp->cooPerm_v = NULL; 30527e8381f9SStefano Zampini cusp->cooPerm_w = NULL; 30537e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 30547e8381f9SStefano Zampini } 30557e8381f9SStefano Zampini if (n) { 30567e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 30577e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 30587e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 30597e8381f9SStefano Zampini 30607e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 30617e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 30627e8381f9SStefano Zampini 30637e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 30647e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 30657e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 30667e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 30677e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 30687e8381f9SStefano Zampini 30697e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 30707e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 30717e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 30727e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 30737e8381f9SStefano Zampini 30747e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 30757e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 30767e8381f9SStefano Zampini delete cusp->cooPerm_a; 30777e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 30787e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 30797e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 30807e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 30817e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 30827e8381f9SStefano Zampini w[0] = 0; 30837e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 30847e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 30857e8381f9SStefano Zampini } 30867e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 30877e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 30887e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 30897e8381f9SStefano Zampini ii.begin()); 30907e8381f9SStefano Zampini 30917e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 30927e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 30937e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 30947e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 30957e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 30967e8381f9SStefano Zampini a->i[0] = 0; 30977e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 30987e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 30997e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 31007e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 31017e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 31027e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 31037e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 31047e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 31057e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 31067e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 31077e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 31087e8381f9SStefano Zampini } 31097e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 31107e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 31117e8381f9SStefano Zampini } else { 31127e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 31137e8381f9SStefano Zampini } 31147e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 31157e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 31167e8381f9SStefano Zampini 31177e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 31187e8381f9SStefano Zampini The code is so convoluted now that I prefer to copy garbage to the GPU */ 31197e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 31207e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 31217e8381f9SStefano Zampini A->nonzerostate++; 31227e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 31237e8381f9SStefano Zampini { 31247e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 31257e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 31267e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 31277e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 31287e8381f9SStefano Zampini } 31297e8381f9SStefano Zampini 31307e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 31317e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 31327e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 31337e8381f9SStefano Zampini PetscFunctionReturn(0); 31347e8381f9SStefano Zampini } 3135