19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 16e8d2b73aSMark Adams 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 726fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 769ae82921SPaul Mullowney 777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 827f756511SDominic Meiser 83042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 8657181aedSStefano Zampini 877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 897e8381f9SStefano Zampini 90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91c215019aSStefano Zampini 92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93b06137fdSPaul Mullowney { 94b06137fdSPaul Mullowney cusparseStatus_t stat; 95b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 96b06137fdSPaul Mullowney 97b06137fdSPaul Mullowney PetscFunctionBegin; 98d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10057d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101b06137fdSPaul Mullowney PetscFunctionReturn(0); 102b06137fdSPaul Mullowney } 103b06137fdSPaul Mullowney 104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105b06137fdSPaul Mullowney { 106b06137fdSPaul Mullowney cusparseStatus_t stat; 107b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108b06137fdSPaul Mullowney 109b06137fdSPaul Mullowney PetscFunctionBegin; 110d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1116b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11216a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11357d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11416a2e217SAlejandro Lamas Daviña } 115b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1166b1cf21dSAlejandro Lamas Daviña } 11757d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118b06137fdSPaul Mullowney PetscFunctionReturn(0); 119b06137fdSPaul Mullowney } 120b06137fdSPaul Mullowney 121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122b06137fdSPaul Mullowney { 123b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1247e8381f9SStefano Zampini PetscBool flg; 1257e8381f9SStefano Zampini PetscErrorCode ierr; 126ccdfe979SStefano Zampini 127b06137fdSPaul Mullowney PetscFunctionBegin; 1287e8381f9SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1297e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 131b06137fdSPaul Mullowney PetscFunctionReturn(0); 132b06137fdSPaul Mullowney } 133b06137fdSPaul Mullowney 134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1359ae82921SPaul Mullowney { 1369ae82921SPaul Mullowney PetscFunctionBegin; 1379ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1389ae82921SPaul Mullowney PetscFunctionReturn(0); 1399ae82921SPaul Mullowney } 1409ae82921SPaul Mullowney 141c708e6cdSJed Brown /*MC 142087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
148c708e6cdSJed Brown 1499ae82921SPaul Mullowney Level: beginner 150c708e6cdSJed Brown 1513ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152c708e6cdSJed Brown M*/ 1539ae82921SPaul Mullowney 15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1559ae82921SPaul Mullowney { 1569ae82921SPaul Mullowney PetscErrorCode ierr; 157bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1589ae82921SPaul Mullowney 1599ae82921SPaul Mullowney PetscFunctionBegin; 160bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1622c7c0729SBarry Smith (*B)->factortype = ftype; 1639ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1642205254eSKarl Rupp 1659c1083e7SRichard Tran Mills if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 166087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16733d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1689c1083e7SRichard Tran Mills if (!A->boundtocpu) { 1699ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1709ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1719c1083e7SRichard Tran Mills } else { 1729c1083e7SRichard Tran Mills (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1739c1083e7SRichard Tran Mills (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1749c1083e7SRichard Tran Mills } 1754ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1764ac6704cSBarry Smith ierr = 
PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 1774ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 178087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1799c1083e7SRichard Tran Mills if (!A->boundtocpu) { 180087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 181087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1829c1083e7SRichard Tran Mills } else { 1839c1083e7SRichard Tran Mills (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1849c1083e7SRichard Tran Mills (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1859c1083e7SRichard Tran Mills } 1864ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 1874ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 1889ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 189bc3f50f2SPaul Mullowney 190fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1914ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1923ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1939ae82921SPaul Mullowney PetscFunctionReturn(0); 1949ae82921SPaul Mullowney } 1959ae82921SPaul Mullowney 196bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 197ca45077fSPaul Mullowney { 198aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1996e111a19SKarl Rupp 
200ca45077fSPaul Mullowney PetscFunctionBegin; 201ca45077fSPaul Mullowney switch (op) { 202e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 203aa372e3fSPaul Mullowney cusparsestruct->format = format; 204ca45077fSPaul Mullowney break; 205e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 206aa372e3fSPaul Mullowney cusparsestruct->format = format; 207ca45077fSPaul Mullowney break; 208ca45077fSPaul Mullowney default: 20936d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 210ca45077fSPaul Mullowney } 211ca45077fSPaul Mullowney PetscFunctionReturn(0); 212ca45077fSPaul Mullowney } 2139ae82921SPaul Mullowney 214e057df02SPaul Mullowney /*@ 215e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 216e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 217aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 218e057df02SPaul Mullowney Not Collective 219e057df02SPaul Mullowney 220e057df02SPaul Mullowney Input Parameters: 2218468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 22236d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2232692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 224e057df02SPaul Mullowney 225e057df02SPaul Mullowney Output Parameter: 226e057df02SPaul Mullowney 227e057df02SPaul Mullowney Level: intermediate 228e057df02SPaul Mullowney 2298468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 230e057df02SPaul Mullowney @*/ 231e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 232e057df02SPaul Mullowney { 233e057df02SPaul Mullowney PetscErrorCode ierr; 2346e111a19SKarl Rupp 235e057df02SPaul Mullowney PetscFunctionBegin; 236e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 237e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 238e057df02SPaul Mullowney PetscFunctionReturn(0); 239e057df02SPaul Mullowney } 240e057df02SPaul Mullowney 241365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 242365b711fSMark Adams { 243365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 244365b711fSMark Adams 245365b711fSMark Adams PetscFunctionBegin; 246365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 247365b711fSMark Adams PetscFunctionReturn(0); 248365b711fSMark Adams } 249365b711fSMark Adams 250365b711fSMark Adams /*@ 251365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 252365b711fSMark Adams 253365b711fSMark Adams Input Parameters: 254365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 255365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 256365b711fSMark Adams 257365b711fSMark Adams Output Parameter: 258365b711fSMark Adams 259365b711fSMark Adams Notes: 260365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 261365b711fSMark Adams and moves the factors to the GPU for the solve. 
We have observed better performance keeping the data on the CPU and computing the solve there. 262365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 263365b711fSMark Adams 264365b711fSMark Adams Level: intermediate 265365b711fSMark Adams 266365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 267365b711fSMark Adams @*/ 268365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 269365b711fSMark Adams { 270365b711fSMark Adams PetscErrorCode ierr; 271365b711fSMark Adams 272365b711fSMark Adams PetscFunctionBegin; 273365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 274365b711fSMark Adams ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr); 275365b711fSMark Adams PetscFunctionReturn(0); 276365b711fSMark Adams } 277365b711fSMark Adams 2781a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 279e6e9a74fSStefano Zampini { 280e6e9a74fSStefano Zampini PetscErrorCode ierr; 281e6e9a74fSStefano Zampini 282e6e9a74fSStefano Zampini PetscFunctionBegin; 2831a2c6b5cSJunchao Zhang switch (op) { 2841a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2851a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2861a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2871a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2881a2c6b5cSJunchao Zhang break; 2891a2c6b5cSJunchao Zhang default: 2901a2c6b5cSJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2911a2c6b5cSJunchao Zhang break; 292e6e9a74fSStefano Zampini } 293e6e9a74fSStefano Zampini PetscFunctionReturn(0); 294e6e9a74fSStefano Zampini } 295e6e9a74fSStefano Zampini 296bddcd29dSMark Adams static PetscErrorCode 
MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 297bddcd29dSMark Adams 298bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 299bddcd29dSMark Adams { 300bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 301bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 302bddcd29dSMark Adams PetscBool row_identity,col_identity; 303365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 304bddcd29dSMark Adams PetscErrorCode ierr; 305bddcd29dSMark Adams 306bddcd29dSMark Adams PetscFunctionBegin; 307bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 308bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 309bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 310bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 311bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 312bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 313bddcd29dSMark Adams if (row_identity && col_identity) { 314365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 315bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 316bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 317365b711fSMark Adams } 318bddcd29dSMark Adams B->ops->matsolve = NULL; 319bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 320bddcd29dSMark Adams } else { 321365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 322bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 323bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 324365b711fSMark Adams } 325bddcd29dSMark Adams B->ops->matsolve = NULL; 326bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 327bddcd29dSMark Adams } 328bddcd29dSMark Adams 329bddcd29dSMark Adams /* get the triangular factors */ 330365b711fSMark Adams if 
(!cusparsestruct->use_cpu_solve) { 331bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 332365b711fSMark Adams } 333bddcd29dSMark Adams PetscFunctionReturn(0); 334bddcd29dSMark Adams } 335bddcd29dSMark Adams 3364416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 3379ae82921SPaul Mullowney { 3389ae82921SPaul Mullowney PetscErrorCode ierr; 339e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 3409ae82921SPaul Mullowney PetscBool flg; 341a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3426e111a19SKarl Rupp 3439ae82921SPaul Mullowney PetscFunctionBegin; 344e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 3459ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 346e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 347a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 348afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 349afb2bd1cSJunchao Zhang 3504c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 351a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 352afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 353365b711fSMark Adams ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr); 354365b711fSMark Adams if (flg) {ierr = 
MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);} 355afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 356afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 357afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 358afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 359a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 360a435da06SStefano Zampini if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 361a435da06SStefano Zampini #else 362afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 363a435da06SStefano Zampini #endif 364afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 365afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 366afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 367afb2bd1cSJunchao Zhang 368afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 369afb2bd1cSJunchao Zhang 
"cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 370afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 371afb2bd1cSJunchao Zhang #endif 3724c87dfd4SPaul Mullowney } 3730af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3749ae82921SPaul Mullowney PetscFunctionReturn(0); 3759ae82921SPaul Mullowney } 3769ae82921SPaul Mullowney 3776fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3789ae82921SPaul Mullowney { 379da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3809ae82921SPaul Mullowney PetscErrorCode ierr; 3819ae82921SPaul Mullowney 3829ae82921SPaul Mullowney PetscFunctionBegin; 383da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3849ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3859ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3869ae82921SPaul Mullowney PetscFunctionReturn(0); 3879ae82921SPaul Mullowney } 3889ae82921SPaul Mullowney 3896fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3909ae82921SPaul Mullowney { 391da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3929ae82921SPaul Mullowney PetscErrorCode ierr; 3939ae82921SPaul Mullowney 3949ae82921SPaul Mullowney PetscFunctionBegin; 395da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3969ae82921SPaul Mullowney ierr = 
MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3979ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3989ae82921SPaul Mullowney PetscFunctionReturn(0); 3999ae82921SPaul Mullowney } 4009ae82921SPaul Mullowney 401087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 402087f3262SPaul Mullowney { 403da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 404087f3262SPaul Mullowney PetscErrorCode ierr; 405087f3262SPaul Mullowney 406087f3262SPaul Mullowney PetscFunctionBegin; 407da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 408087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 409087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 410087f3262SPaul Mullowney PetscFunctionReturn(0); 411087f3262SPaul Mullowney } 412087f3262SPaul Mullowney 413087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 414087f3262SPaul Mullowney { 415da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 416087f3262SPaul Mullowney PetscErrorCode ierr; 417087f3262SPaul Mullowney 418087f3262SPaul Mullowney PetscFunctionBegin; 419da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 420087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 421087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 422087f3262SPaul Mullowney PetscFunctionReturn(0); 423087f3262SPaul Mullowney } 424087f3262SPaul Mullowney 425087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 4269ae82921SPaul 
Mullowney { 4279ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4289ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4299ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 430aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 4319ae82921SPaul Mullowney cusparseStatus_t stat; 4329ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 4339ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4349ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 4359ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 436b175d8bbSPaul Mullowney PetscErrorCode ierr; 43757d48284SJunchao Zhang cudaError_t cerr; 4389ae82921SPaul Mullowney 4399ae82921SPaul Mullowney PetscFunctionBegin; 440cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 441c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4429ae82921SPaul Mullowney try { 4439ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 4449ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 445da79fbbcSStefano Zampini if (!loTriFactor) { 4462cbc15d9SMark PetscScalar *AALo; 4472cbc15d9SMark 4482cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4499ae82921SPaul Mullowney 4509ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 45157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 45257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 4539ae82921SPaul Mullowney 4549ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 4559ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 4569ae82921SPaul Mullowney AiLo[n] = nzLower; 4579ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 4589ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4599ae82921SPaul Mullowney v = aa; 4609ae82921SPaul Mullowney vi = aj; 4619ae82921SPaul Mullowney offset = 1; 4629ae82921SPaul Mullowney rowOffset= 1; 4639ae82921SPaul Mullowney for (i=1; i<n; i++) { 4649ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 465e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4669ae82921SPaul Mullowney AiLo[i] = rowOffset; 4679ae82921SPaul Mullowney rowOffset += nz+1; 4689ae82921SPaul Mullowney 469580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 470580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 4719ae82921SPaul Mullowney 4729ae82921SPaul Mullowney offset += nz; 4739ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4749ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4759ae82921SPaul Mullowney offset += 1; 4769ae82921SPaul Mullowney 4779ae82921SPaul Mullowney v += nz; 4789ae82921SPaul Mullowney vi += nz; 4799ae82921SPaul Mullowney } 4802205254eSKarl Rupp 481aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 482da79fbbcSStefano Zampini ierr = 
PetscNew(&loTriFactor);CHKERRQ(ierr); 483da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 484aa372e3fSPaul Mullowney /* Create the matrix description */ 48557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 48657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4871b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 488afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 489afb2bd1cSJunchao Zhang #else 49057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 491afb2bd1cSJunchao Zhang #endif 49257d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 49357d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 494aa372e3fSPaul Mullowney 495aa372e3fSPaul Mullowney /* set the operation */ 496aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 497aa372e3fSPaul Mullowney 498aa372e3fSPaul Mullowney /* set the matrix */ 499aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 500aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 501aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 502aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 503aa372e3fSPaul Mullowney 504aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 505aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 506aa372e3fSPaul Mullowney 507aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 508aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 509aa372e3fSPaul Mullowney 510aa372e3fSPaul 
Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 511aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 512aa372e3fSPaul Mullowney 513afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 514da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 515afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5161b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 517afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 518afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 519afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 520afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 521afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 522afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 523afb2bd1cSJunchao Zhang #endif 524afb2bd1cSJunchao Zhang 525aa372e3fSPaul Mullowney /* perform the solve analysis */ 526aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 527aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 528aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 529d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 5301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 531d49cd2b7SBarry Smith loTriFactor->solveInfo, 532d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 533d49cd2b7SBarry Smith #else 534d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 
535afb2bd1cSJunchao Zhang #endif 536da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 537da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 538aa372e3fSPaul Mullowney 539da79fbbcSStefano Zampini /* assign the pointer */ 540aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 5412cbc15d9SMark loTriFactor->AA_h = AALo; 54257d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 54357d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 5444863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 545da79fbbcSStefano Zampini } else { /* update values only */ 5462cbc15d9SMark if (!loTriFactor->AA_h) { 5472cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 5482cbc15d9SMark } 549da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 5502cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 551da79fbbcSStefano Zampini v = aa; 552da79fbbcSStefano Zampini vi = aj; 553da79fbbcSStefano Zampini offset = 1; 554da79fbbcSStefano Zampini for (i=1; i<n; i++) { 555da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 5562cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 557da79fbbcSStefano Zampini offset += nz; 5582cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 559da79fbbcSStefano Zampini offset += 1; 560da79fbbcSStefano Zampini v += nz; 561da79fbbcSStefano Zampini } 5622cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 563da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 564da79fbbcSStefano Zampini } 5659ae82921SPaul Mullowney } catch(char *ex) { 5669ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5679ae82921SPaul Mullowney } 5689ae82921SPaul Mullowney } 5699ae82921SPaul Mullowney 
PetscFunctionReturn(0); 5709ae82921SPaul Mullowney } 5719ae82921SPaul Mullowney 572087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5739ae82921SPaul Mullowney { 5749ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5759ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5769ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 577aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5789ae82921SPaul Mullowney cusparseStatus_t stat; 5799ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5809ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5819ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5829ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5839ae82921SPaul Mullowney PetscErrorCode ierr; 58457d48284SJunchao Zhang cudaError_t cerr; 5859ae82921SPaul Mullowney 5869ae82921SPaul Mullowney PetscFunctionBegin; 587cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 588c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5899ae82921SPaul Mullowney try { 5909ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5919ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 592da79fbbcSStefano Zampini if (!upTriFactor) { 5932cbc15d9SMark PetscScalar *AAUp; 5942cbc15d9SMark 5952cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5962cbc15d9SMark 5979ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 59857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 59957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 6009ae82921SPaul Mullowney 6019ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 6029ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 6039ae82921SPaul Mullowney AiUp[n]=nzUpper; 6049ae82921SPaul Mullowney offset = nzUpper; 6059ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 6069ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 6079ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 6089ae82921SPaul Mullowney 609e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 6109ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 6119ae82921SPaul Mullowney 612e057df02SPaul Mullowney /* decrement the offset */ 6139ae82921SPaul Mullowney offset -= (nz+1); 6149ae82921SPaul Mullowney 615e057df02SPaul Mullowney /* first, set the diagonal elements */ 6169ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 61709f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 6189ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 6199ae82921SPaul Mullowney 620580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 621580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 6229ae82921SPaul Mullowney } 6232205254eSKarl Rupp 624aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 625da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 626da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 6272205254eSKarl Rupp 628aa372e3fSPaul Mullowney /* Create the matrix description */ 62957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 63057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 6311b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 632afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 633afb2bd1cSJunchao Zhang #else 63457d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 635afb2bd1cSJunchao Zhang #endif 63657d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 63757d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 638aa372e3fSPaul Mullowney 639aa372e3fSPaul Mullowney /* set the operation */ 640aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 641aa372e3fSPaul Mullowney 642aa372e3fSPaul Mullowney /* set the matrix */ 643aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 644aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 645aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 646aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 647aa372e3fSPaul Mullowney 648aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 649aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 650aa372e3fSPaul Mullowney 651aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 652aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 653aa372e3fSPaul Mullowney 654aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
655aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 656aa372e3fSPaul Mullowney 657afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 658da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 659afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 6601b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 661afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 662afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 663afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 664afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 665afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 666afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 667afb2bd1cSJunchao Zhang #endif 668afb2bd1cSJunchao Zhang 669aa372e3fSPaul Mullowney /* perform the solve analysis */ 670aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 671aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 672aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 673d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6741b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 675d49cd2b7SBarry Smith upTriFactor->solveInfo, 676d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 677d49cd2b7SBarry Smith #else 678d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 679afb2bd1cSJunchao Zhang #endif 680da79fbbcSStefano Zampini cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 681da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 682aa372e3fSPaul Mullowney 683da79fbbcSStefano Zampini /* assign the pointer */ 684aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6852cbc15d9SMark upTriFactor->AA_h = AAUp; 68657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 68757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6884863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 689da79fbbcSStefano Zampini } else { 6902cbc15d9SMark if (!upTriFactor->AA_h) { 6912cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6922cbc15d9SMark } 693da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 694da79fbbcSStefano Zampini offset = nzUpper; 695da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 696da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 697da79fbbcSStefano Zampini 698da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 699da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 700da79fbbcSStefano Zampini 701da79fbbcSStefano Zampini /* decrement the offset */ 702da79fbbcSStefano Zampini offset -= (nz+1); 703da79fbbcSStefano Zampini 704da79fbbcSStefano Zampini /* first, set the diagonal elements */ 7052cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 7062cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 707da79fbbcSStefano Zampini } 7082cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 709da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 710da79fbbcSStefano Zampini } 7119ae82921SPaul Mullowney } catch(char *ex) { 7129ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 7139ae82921SPaul 
Mullowney } 7149ae82921SPaul Mullowney } 7159ae82921SPaul Mullowney PetscFunctionReturn(0); 7169ae82921SPaul Mullowney } 7179ae82921SPaul Mullowney 718087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 7199ae82921SPaul Mullowney { 7209ae82921SPaul Mullowney PetscErrorCode ierr; 7219ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 7229ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 7239ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 7249ae82921SPaul Mullowney PetscBool row_identity,col_identity; 7259ae82921SPaul Mullowney PetscInt n = A->rmap->n; 7269ae82921SPaul Mullowney 7279ae82921SPaul Mullowney PetscFunctionBegin; 728da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 729087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 730087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 7312205254eSKarl Rupp 732da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 733aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 7349ae82921SPaul Mullowney 735c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 736e057df02SPaul Mullowney /* lower triangular indices */ 7379ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 738da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 739da79fbbcSStefano Zampini const PetscInt *r; 740da79fbbcSStefano Zampini 741da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 742aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 743aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 7449ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 745da79fbbcSStefano Zampini ierr 
= PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 746da79fbbcSStefano Zampini } 7479ae82921SPaul Mullowney 748e057df02SPaul Mullowney /* upper triangular indices */ 7499ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 750da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 751da79fbbcSStefano Zampini const PetscInt *c; 752da79fbbcSStefano Zampini 753da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 754aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 755aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 7569ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 757da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 758da79fbbcSStefano Zampini } 7599ae82921SPaul Mullowney PetscFunctionReturn(0); 7609ae82921SPaul Mullowney } 7619ae82921SPaul Mullowney 762087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 763087f3262SPaul Mullowney { 764087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 765087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 766aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 767aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 768087f3262SPaul Mullowney cusparseStatus_t stat; 769087f3262SPaul Mullowney PetscErrorCode ierr; 77057d48284SJunchao Zhang cudaError_t cerr; 771087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 772087f3262SPaul Mullowney PetscScalar *AAUp; 773087f3262SPaul Mullowney PetscScalar *AALo; 774087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 775087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 776087f3262SPaul 
Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 777087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 778087f3262SPaul Mullowney 779087f3262SPaul Mullowney PetscFunctionBegin; 780cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 781c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 782087f3262SPaul Mullowney try { 783da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 784da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 785da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 786087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 78757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 78857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 789087f3262SPaul Mullowney 790087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 791087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 792087f3262SPaul Mullowney AiUp[n]=nzUpper; 793087f3262SPaul Mullowney offset = 0; 794087f3262SPaul Mullowney for (i=0; i<n; i++) { 795087f3262SPaul Mullowney /* set the pointers */ 796087f3262SPaul Mullowney v = aa + ai[i]; 797087f3262SPaul Mullowney vj = aj + ai[i]; 798087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 799087f3262SPaul Mullowney 800087f3262SPaul Mullowney /* first, set the diagonal elements */ 801087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 80209f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 803087f3262SPaul Mullowney AiUp[i] = offset; 80409f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 805087f3262SPaul Mullowney 806087f3262SPaul Mullowney offset+=1; 807087f3262SPaul Mullowney if (nz>0) { 808f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 
809580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 810087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 811087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 812087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 813087f3262SPaul Mullowney } 814087f3262SPaul Mullowney offset+=nz; 815087f3262SPaul Mullowney } 816087f3262SPaul Mullowney } 817087f3262SPaul Mullowney 818aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 819da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 820da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 821087f3262SPaul Mullowney 822aa372e3fSPaul Mullowney /* Create the matrix description */ 82357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 82457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8251b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 826afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 827afb2bd1cSJunchao Zhang #else 82857d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 829afb2bd1cSJunchao Zhang #endif 83057d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 83157d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 832087f3262SPaul Mullowney 833aa372e3fSPaul Mullowney /* set the matrix */ 834aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 835aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 836aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 837aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 838aa372e3fSPaul Mullowney 839aa372e3fSPaul Mullowney 
upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 840aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 841aa372e3fSPaul Mullowney 842aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 843aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 846aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 847aa372e3fSPaul Mullowney 848afb2bd1cSJunchao Zhang /* set the operation */ 849afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 850afb2bd1cSJunchao Zhang 851afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 852da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 853afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 855afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 856afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 857afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 858afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 859afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 860afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 861afb2bd1cSJunchao Zhang #endif 862afb2bd1cSJunchao Zhang 863aa372e3fSPaul Mullowney /* perform the solve analysis */ 864aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 865aa372e3fSPaul Mullowney 
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 866aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 867d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 8681b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 869d49cd2b7SBarry Smith upTriFactor->solveInfo, 870d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 871d49cd2b7SBarry Smith #else 872d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 873afb2bd1cSJunchao Zhang #endif 874da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 875da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 876aa372e3fSPaul Mullowney 877da79fbbcSStefano Zampini /* assign the pointer */ 878aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 879aa372e3fSPaul Mullowney 880aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 881da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 882da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 883aa372e3fSPaul Mullowney 884aa372e3fSPaul Mullowney /* Create the matrix description */ 88557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 88657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8871b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 888afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 889afb2bd1cSJunchao Zhang #else 89057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 891afb2bd1cSJunchao Zhang #endif 89257d48284SJunchao Zhang stat = 
cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 89357d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 894aa372e3fSPaul Mullowney 895aa372e3fSPaul Mullowney /* set the operation */ 896aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 897aa372e3fSPaul Mullowney 898aa372e3fSPaul Mullowney /* set the matrix */ 899aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 900aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 901aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 902aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 903aa372e3fSPaul Mullowney 904aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 905aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 906aa372e3fSPaul Mullowney 907aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 908aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 909aa372e3fSPaul Mullowney 910aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 911aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 912aa372e3fSPaul Mullowney 913afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 914da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 915afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 9161b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 917afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 918afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 919afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 920afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 921afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 922afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 923afb2bd1cSJunchao Zhang #endif 924afb2bd1cSJunchao Zhang 925aa372e3fSPaul Mullowney /* perform the solve analysis */ 926aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 927aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 928aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 929d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 9301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 931d49cd2b7SBarry Smith loTriFactor->solveInfo, 932d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 933d49cd2b7SBarry Smith #else 934d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 935afb2bd1cSJunchao Zhang #endif 936da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 937da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 938aa372e3fSPaul Mullowney 939da79fbbcSStefano Zampini /* assign the pointer */ 940aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 941087f3262SPaul Mullowney 942da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 94357d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 94457d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 945da79fbbcSStefano Zampini } else { 946da79fbbcSStefano Zampini /* Fill the upper 
triangular matrix */ 947da79fbbcSStefano Zampini offset = 0; 948da79fbbcSStefano Zampini for (i=0; i<n; i++) { 949da79fbbcSStefano Zampini /* set the pointers */ 950da79fbbcSStefano Zampini v = aa + ai[i]; 951da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 952da79fbbcSStefano Zampini 953da79fbbcSStefano Zampini /* first, set the diagonal elements */ 954da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 955da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 956da79fbbcSStefano Zampini 957da79fbbcSStefano Zampini offset+=1; 958da79fbbcSStefano Zampini if (nz>0) { 959da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 960da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 961da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 962da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 963da79fbbcSStefano Zampini } 964da79fbbcSStefano Zampini offset+=nz; 965da79fbbcSStefano Zampini } 966da79fbbcSStefano Zampini } 967da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 968da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 969da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 970da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 971da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 972da79fbbcSStefano Zampini } 97357d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 97457d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 975087f3262SPaul Mullowney } catch(char *ex) { 976087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 977087f3262SPaul Mullowney } 978087f3262SPaul Mullowney } 979087f3262SPaul Mullowney PetscFunctionReturn(0); 980087f3262SPaul Mullowney } 981087f3262SPaul Mullowney 982087f3262SPaul Mullowney static PetscErrorCode 
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9839ae82921SPaul Mullowney { 9849ae82921SPaul Mullowney PetscErrorCode ierr; 985087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 986087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 987087f3262SPaul Mullowney IS ip = a->row; 988087f3262SPaul Mullowney PetscBool perm_identity; 989087f3262SPaul Mullowney PetscInt n = A->rmap->n; 990087f3262SPaul Mullowney 991087f3262SPaul Mullowney PetscFunctionBegin; 992da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 993087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 994da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 995aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 996aa372e3fSPaul Mullowney 997da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 998da79fbbcSStefano Zampini 999087f3262SPaul Mullowney /* lower triangular indices */ 1000087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1001087f3262SPaul Mullowney if (!perm_identity) { 10024e4bbfaaSStefano Zampini IS iip; 1003da79fbbcSStefano Zampini const PetscInt *irip,*rip; 10044e4bbfaaSStefano Zampini 10054e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 10064e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 1007da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1008aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1009aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 1010aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 10114e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 10124e4bbfaaSStefano Zampini ierr = 
ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 10134e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 1014087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1015da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 1016da79fbbcSStefano Zampini } 1017087f3262SPaul Mullowney PetscFunctionReturn(0); 1018087f3262SPaul Mullowney } 1019087f3262SPaul Mullowney 1020087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 1021087f3262SPaul Mullowney { 1022087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1023087f3262SPaul Mullowney IS ip = b->row; 1024087f3262SPaul Mullowney PetscBool perm_identity; 1025b175d8bbSPaul Mullowney PetscErrorCode ierr; 1026087f3262SPaul Mullowney 1027087f3262SPaul Mullowney PetscFunctionBegin; 102857181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1029087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1030ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 1031087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 1032087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1033087f3262SPaul Mullowney if (perm_identity) { 1034087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1035087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 10364e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10374e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1038087f3262SPaul Mullowney } else { 1039087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1040087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 10414e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10424e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1043087f3262SPaul Mullowney } 1044087f3262SPaul Mullowney 1045087f3262SPaul Mullowney /* get the triangular factors */ 1046087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1047087f3262SPaul Mullowney PetscFunctionReturn(0); 1048087f3262SPaul Mullowney } 10499ae82921SPaul Mullowney 1050b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1051bda325fcSPaul Mullowney { 1052bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1053aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1054aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1055da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1056da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1057bda325fcSPaul Mullowney cusparseStatus_t stat; 1058aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1059aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1060aa372e3fSPaul Mullowney cusparseFillMode_t 
fillMode; 1061aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 10621b0a6780SStefano Zampini cudaError_t cerr; 1063da79fbbcSStefano Zampini PetscErrorCode ierr; 1064b175d8bbSPaul Mullowney 1065bda325fcSPaul Mullowney PetscFunctionBegin; 1066aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1067da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1068da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1069aa372e3fSPaul Mullowney 1070aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1071aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1072aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1073aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1074aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1075aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1076aa372e3fSPaul Mullowney 1077aa372e3fSPaul Mullowney /* Create the matrix description */ 107857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 107957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 108057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 108157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 108257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1083aa372e3fSPaul Mullowney 1084aa372e3fSPaul Mullowney /* set the operation */ 1085aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1086aa372e3fSPaul Mullowney 1087aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 
1088aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1089afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1090afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1091aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1092afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1093afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1094afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1095aa372e3fSPaul Mullowney 1096aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1097afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1098afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1099afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1100afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1101afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1102afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1103afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1104afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1105afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1106afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 11071b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1108afb2bd1cSJunchao Zhang #endif 1109afb2bd1cSJunchao Zhang 1110da79fbbcSStefano Zampini ierr = 
PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1111aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1112aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1113aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1114aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1115aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1116aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1117afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1118afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1119afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1120d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1121afb2bd1cSJunchao Zhang #else 1122afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1123d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1124afb2bd1cSJunchao Zhang #endif 1125da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1126da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1127aa372e3fSPaul Mullowney 1128afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1129da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1130afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11311b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1132afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1133afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 
loTriFactorT->descr, 1134afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1135afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1136afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1137afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1138afb2bd1cSJunchao Zhang #endif 1139afb2bd1cSJunchao Zhang 1140afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1141aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1142afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1143afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1144d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 11451b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1146d49cd2b7SBarry Smith loTriFactorT->solveInfo, 1147d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1148d49cd2b7SBarry Smith #else 1149d49cd2b7SBarry Smith loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1150afb2bd1cSJunchao Zhang #endif 1151da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1152da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1153aa372e3fSPaul Mullowney 1154da79fbbcSStefano Zampini /* assign the pointer */ 1155aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1156aa372e3fSPaul Mullowney 1157aa372e3fSPaul Mullowney /*********************************************/ 1158aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1159aa372e3fSPaul Mullowney /*********************************************/ 1160aa372e3fSPaul Mullowney 
1161aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1162da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1163da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1164aa372e3fSPaul Mullowney 1165aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1166aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1167aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1168aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1169aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1170aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1171aa372e3fSPaul Mullowney 1172aa372e3fSPaul Mullowney /* Create the matrix description */ 117357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 117457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 117557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 117657d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 117757d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1178aa372e3fSPaul Mullowney 1179aa372e3fSPaul Mullowney /* set the operation */ 1180aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1181aa372e3fSPaul Mullowney 1182aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1183aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1184afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1185afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 
1186aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1187afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1188afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1189afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1190aa372e3fSPaul Mullowney 1191aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1192afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1193afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1194afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1195afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1196afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1197afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1198afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1199afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1200afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1201afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1202afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1203afb2bd1cSJunchao Zhang #endif 1204afb2bd1cSJunchao Zhang 1205da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1206aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1207aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1208aa372e3fSPaul Mullowney 
upTriFactor->csrMat->values->data().get(), 1209aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1210aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1211aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1212afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1213afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1214afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1215d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1216afb2bd1cSJunchao Zhang #else 1217afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1218d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1219afb2bd1cSJunchao Zhang #endif 1220d49cd2b7SBarry Smith 1221da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1222da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1223aa372e3fSPaul Mullowney 1224afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1225da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1226afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 12271b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1228afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1229afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1230afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1231afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1232afb2bd1cSJunchao Zhang 
&upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1233afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1234afb2bd1cSJunchao Zhang #endif 1235afb2bd1cSJunchao Zhang 1236afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1237aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1238afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1239afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1240d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 12411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1242d49cd2b7SBarry Smith upTriFactorT->solveInfo, 1243d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1244d49cd2b7SBarry Smith #else 1245d49cd2b7SBarry Smith upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1246afb2bd1cSJunchao Zhang #endif 1247d49cd2b7SBarry Smith 1248da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1249da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1250aa372e3fSPaul Mullowney 1251da79fbbcSStefano Zampini /* assign the pointer */ 1252aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1253bda325fcSPaul Mullowney PetscFunctionReturn(0); 1254bda325fcSPaul Mullowney } 1255bda325fcSPaul Mullowney 1256a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1257a49f1ed0SStefano Zampini { 1258a49f1ed0SStefano Zampini __host__ __device__ 1259a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1260a49f1ed0SStefano Zampini { 1261a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1262a49f1ed0SStefano Zampini } 1263a49f1ed0SStefano Zampini }; 1264a49f1ed0SStefano Zampini 12653606e59fSJunchao Zhang static 
PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1266bda325fcSPaul Mullowney { 1267aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1268a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1269bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1270bda325fcSPaul Mullowney cusparseStatus_t stat; 1271aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1272b06137fdSPaul Mullowney cudaError_t err; 127385ba7357SStefano Zampini PetscErrorCode ierr; 1274b175d8bbSPaul Mullowney 1275bda325fcSPaul Mullowney PetscFunctionBegin; 1276a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1277a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1278e8d2b73aSMark Adams if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1279a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1280e8d2b73aSMark Adams if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12811a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 128285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1283ee7b52eaSHong Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1284a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1285a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1286a49f1ed0SStefano Zampini } 1287a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1288aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 128957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1290aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 129157d48284SJunchao Zhang stat = 
cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 129257d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1293aa372e3fSPaul Mullowney 1294b06137fdSPaul Mullowney /* set alpha and beta */ 1295afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12967656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12977656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1298afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12997656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 13007656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1301b06137fdSPaul Mullowney 1302aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1303aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1304a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1305554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1306554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1307aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1308a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1309aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1310aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1311a3fdcf43SKarl Rupp 1312039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 131381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1314afb2bd1cSJunchao Zhang 
1315afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 13163606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1317afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1318afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1319afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1320afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1321afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1322afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 13233606e59fSJunchao Zhang #else 13243606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 13253606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 13263606e59fSJunchao Zhang 13273606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 13283606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 13293606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
13303606e59fSJunchao Zhang */ 13313606e59fSJunchao Zhang if (matrixT->num_entries) { 13323606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 13333606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 13343606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 13353606e59fSJunchao Zhang matrixT->values->data().get(), 13363606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 13373606e59fSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 13383606e59fSJunchao Zhang 13393606e59fSJunchao Zhang } else { 13403606e59fSJunchao Zhang matstructT->matDescr = NULL; 13413606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13423606e59fSJunchao Zhang } 13433606e59fSJunchao Zhang #endif 1344afb2bd1cSJunchao Zhang #endif 1345aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1346afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1347afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1348afb2bd1cSJunchao Zhang #else 1349aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 135051c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 135151c6d536SStefano Zampini /* First convert HYB to CSR */ 1352aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1353aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1354aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1355aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1356aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1357aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1358aa372e3fSPaul Mullowney 1359aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1360aa372e3fSPaul Mullowney matstruct->descr, 
(cusparseHybMat_t)matstruct->mat, 1361aa372e3fSPaul Mullowney temp->values->data().get(), 1362aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 136357d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1364aa372e3fSPaul Mullowney 1365aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1366aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1367aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1368aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1369aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1370aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1371aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1372aa372e3fSPaul Mullowney 1373aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1374aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1375aa372e3fSPaul Mullowney temp->values->data().get(), 1376aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1377aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1378aa372e3fSPaul Mullowney tempT->values->data().get(), 1379aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1380aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 138157d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1382aa372e3fSPaul Mullowney 1383aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1384aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 138557d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1386aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1387aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1388aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1389aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1390aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1391aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 139257d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1393aa372e3fSPaul Mullowney 1394aa372e3fSPaul Mullowney /* assign the pointer */ 1395aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13961a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1397aa372e3fSPaul Mullowney /* delete temporaries */ 1398aa372e3fSPaul Mullowney if (tempT) { 1399aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1400aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1401aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1402aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1403087f3262SPaul Mullowney } 1404aa372e3fSPaul Mullowney if (temp) { 1405aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1406aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1407aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1408aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1409aa372e3fSPaul Mullowney } 1410afb2bd1cSJunchao Zhang #endif 1411aa372e3fSPaul Mullowney } 1412a49f1ed0SStefano Zampini } 1413a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1414a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1415a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1416e8d2b73aSMark Adams if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1417e8d2b73aSMark Adams if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1418e8d2b73aSMark Adams if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1419e8d2b73aSMark Adams if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1420e8d2b73aSMark Adams if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1421e8d2b73aSMark Adams if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1422e8d2b73aSMark Adams if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1423e8d2b73aSMark Adams if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1424a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1425a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1426a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1427a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1428a49f1ed0SStefano Zampini } 1429a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1430a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1431a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1432a49f1ed0SStefano Zampini 1433a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1434a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1435a49f1ed0SStefano Zampini void *csr2cscBuffer; 1436a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1437a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 
1438a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1439a49f1ed0SStefano Zampini matrix->values->data().get(), 1440a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1441a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1442a49f1ed0SStefano Zampini matrixT->values->data().get(), 1443a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1444a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1445a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1446a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1447a49f1ed0SStefano Zampini #endif 1448a49f1ed0SStefano Zampini 14491a2c6b5cSJunchao Zhang if (matrix->num_entries) { 14501a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 14511a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 14521a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 14531a2c6b5cSJunchao Zhang 14541a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 14551a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
14561a2c6b5cSJunchao Zhang */ 14571a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 14581a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 14591a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 14601a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 14611a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1462a49f1ed0SStefano Zampini matrixT->values->data().get(), 1463a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1464a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1465a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 14661a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1467a49f1ed0SStefano Zampini #else 1468a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14691a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1470a49f1ed0SStefano Zampini #endif 14711a2c6b5cSJunchao Zhang } else { 14721a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14731a2c6b5cSJunchao Zhang } 14741a2c6b5cSJunchao Zhang 1475a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1476a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1477a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1478a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1479a49f1ed0SStefano Zampini #endif 1480a49f1ed0SStefano Zampini } 1481a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1482a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1483a49f1ed0SStefano Zampini matrixT->values->begin())); 1484a49f1ed0SStefano Zampini } 1485ee7b52eaSHong Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 148685ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1487213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1488213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1489aa372e3fSPaul Mullowney /* assign the pointer */ 1490aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14911a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1492bda325fcSPaul Mullowney PetscFunctionReturn(0); 1493bda325fcSPaul Mullowney } 1494bda325fcSPaul Mullowney 1495a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14966fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1497bda325fcSPaul Mullowney { 1498c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1499465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1500465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1501465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1502465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1503bda325fcSPaul Mullowney cusparseStatus_t stat; 1504bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1505aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1506aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1507aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1508b175d8bbSPaul 
Mullowney PetscErrorCode ierr; 1509bda325fcSPaul Mullowney 1510bda325fcSPaul Mullowney PetscFunctionBegin; 1511aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1512aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1513bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1514aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1515aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1516bda325fcSPaul Mullowney } 1517bda325fcSPaul Mullowney 1518bda325fcSPaul Mullowney /* Get the GPU pointers */ 1519c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1520c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1521c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1522c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1523bda325fcSPaul Mullowney 15247a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1525aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1526a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1527c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1528c41cb2e2SAlejandro Lamas Daviña xGPU); 1529aa372e3fSPaul Mullowney 1530aa372e3fSPaul Mullowney /* First, solve U */ 1531aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1532afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1534afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1535afb2bd1cSJunchao Zhang #endif 1536afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1537aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1538aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1539aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1540aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1541d49cd2b7SBarry Smith xarray, 15421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543d49cd2b7SBarry Smith tempGPU->data().get(), 1544d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1545d49cd2b7SBarry Smith #else 1546d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1547afb2bd1cSJunchao Zhang #endif 1548aa372e3fSPaul Mullowney 1549aa372e3fSPaul Mullowney /* Then, solve L */ 1550aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1551afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1553afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1554afb2bd1cSJunchao Zhang #endif 1555afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1556aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1557aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1558aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1559aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1560d49cd2b7SBarry Smith tempGPU->data().get(), 15611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1562d49cd2b7SBarry Smith xarray, 1563d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1564d49cd2b7SBarry Smith #else 1565d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1566afb2bd1cSJunchao Zhang #endif 1567aa372e3fSPaul Mullowney 1568aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1569a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1570c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1571aa372e3fSPaul Mullowney tempGPU->begin()); 1572aa372e3fSPaul Mullowney 1573aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1574a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1575bda325fcSPaul Mullowney 1576bda325fcSPaul Mullowney /* restore */ 1577c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1578c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1579661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1580958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1581bda325fcSPaul Mullowney PetscFunctionReturn(0); 1582bda325fcSPaul Mullowney } 1583bda325fcSPaul Mullowney 15846fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1585bda325fcSPaul Mullowney { 1586465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1587465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1588bda325fcSPaul Mullowney cusparseStatus_t stat; 1589bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1590aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1591aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1592aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1593b175d8bbSPaul Mullowney 
PetscErrorCode ierr; 1594bda325fcSPaul Mullowney 1595bda325fcSPaul Mullowney PetscFunctionBegin; 1596aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1597aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1598bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1599aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1600aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1601bda325fcSPaul Mullowney } 1602bda325fcSPaul Mullowney 1603bda325fcSPaul Mullowney /* Get the GPU pointers */ 1604c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1605c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1606bda325fcSPaul Mullowney 16077a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1608aa372e3fSPaul Mullowney /* First, solve U */ 1609aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1610afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 16111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1612afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1613afb2bd1cSJunchao Zhang #endif 1614afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1615aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1616aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1617aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1618aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1619d49cd2b7SBarry Smith barray, 16201b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1621d49cd2b7SBarry Smith tempGPU->data().get(), 1622d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1623d49cd2b7SBarry Smith #else 
1624d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1625afb2bd1cSJunchao Zhang #endif 1626aa372e3fSPaul Mullowney 1627aa372e3fSPaul Mullowney /* Then, solve L */ 1628aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1629afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 16301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1631afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1632afb2bd1cSJunchao Zhang #endif 1633afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1634aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1635aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1636aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1637aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1638d49cd2b7SBarry Smith tempGPU->data().get(), 16391b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1640d49cd2b7SBarry Smith xarray, 1641d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1642d49cd2b7SBarry Smith #else 1643d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1644afb2bd1cSJunchao Zhang #endif 1645bda325fcSPaul Mullowney 1646bda325fcSPaul Mullowney /* restore */ 1647c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1648c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1649661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1650958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1651bda325fcSPaul Mullowney PetscFunctionReturn(0); 1652bda325fcSPaul Mullowney } 1653bda325fcSPaul Mullowney 16546fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 16559ae82921SPaul Mullowney { 1656465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1657465f34aeSAlejandro Lamas Daviña PetscScalar 
*xarray; 1658465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1659465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 16609ae82921SPaul Mullowney cusparseStatus_t stat; 16619ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1662aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1663aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1664aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1665b175d8bbSPaul Mullowney PetscErrorCode ierr; 16669ae82921SPaul Mullowney 16679ae82921SPaul Mullowney PetscFunctionBegin; 1668ebc8f436SDominic Meiser 1669e057df02SPaul Mullowney /* Get the GPU pointers */ 1670c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1671c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1672c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1673c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16749ae82921SPaul Mullowney 16757a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1676aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1677a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1678c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16794e4bbfaaSStefano Zampini tempGPU->begin()); 1680aa372e3fSPaul Mullowney 1681aa372e3fSPaul Mullowney /* Next, solve L */ 1682aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1683afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->num_rows, 16841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1685afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1686afb2bd1cSJunchao Zhang #endif 1687afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1688aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1689aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1690aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1691aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1692d49cd2b7SBarry Smith tempGPU->data().get(), 16931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1694d49cd2b7SBarry Smith xarray, 1695d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1696d49cd2b7SBarry Smith #else 1697d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1698afb2bd1cSJunchao Zhang #endif 1699aa372e3fSPaul Mullowney 1700aa372e3fSPaul Mullowney /* Then, solve U */ 1701aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1702afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1704afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1705afb2bd1cSJunchao Zhang #endif 1706afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1707aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1708aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1709aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1710d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 17111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1712d49cd2b7SBarry Smith tempGPU->data().get(), 1713d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1714d49cd2b7SBarry Smith #else 1715d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1716afb2bd1cSJunchao Zhang #endif 
1717d49cd2b7SBarry Smith 17184e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1719a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 17204e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 17214e4bbfaaSStefano Zampini xGPU); 17229ae82921SPaul Mullowney 1723c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1724c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1725661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1726958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17279ae82921SPaul Mullowney PetscFunctionReturn(0); 17289ae82921SPaul Mullowney } 17299ae82921SPaul Mullowney 17306fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 17319ae82921SPaul Mullowney { 1732465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1733465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 17349ae82921SPaul Mullowney cusparseStatus_t stat; 17359ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1736aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1737aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1738aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1739b175d8bbSPaul Mullowney PetscErrorCode ierr; 17409ae82921SPaul Mullowney 17419ae82921SPaul Mullowney PetscFunctionBegin; 1742e057df02SPaul Mullowney /* Get the GPU pointers */ 1743c41cb2e2SAlejandro Lamas Daviña ierr = 
VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1744c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 17459ae82921SPaul Mullowney 17467a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1747aa372e3fSPaul Mullowney /* First, solve L */ 1748aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1749afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 17501b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1751afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1752afb2bd1cSJunchao Zhang #endif 1753afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1754aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1755aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1756aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1757aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1758d49cd2b7SBarry Smith barray, 17591b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1760d49cd2b7SBarry Smith tempGPU->data().get(), 1761d49cd2b7SBarry Smith loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1762d49cd2b7SBarry Smith #else 1763d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1764afb2bd1cSJunchao Zhang #endif 1765d49cd2b7SBarry Smith 1766aa372e3fSPaul Mullowney /* Next, solve U */ 1767aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1768afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17691b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1770afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1771afb2bd1cSJunchao Zhang #endif 1772afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1773aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1774aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1775aa372e3fSPaul Mullowney 
upTriFactor->csrMat->column_indices->data().get(), 1776aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1777d49cd2b7SBarry Smith tempGPU->data().get(), 17781b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1779d49cd2b7SBarry Smith xarray, 1780d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1781d49cd2b7SBarry Smith #else 1782d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1783afb2bd1cSJunchao Zhang #endif 17849ae82921SPaul Mullowney 1785c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1786c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1787661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1788958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17899ae82921SPaul Mullowney PetscFunctionReturn(0); 17909ae82921SPaul Mullowney } 17919ae82921SPaul Mullowney 17927e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17937e8381f9SStefano Zampini { 17947e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17957e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17967e8381f9SStefano Zampini cudaError_t cerr; 17977e8381f9SStefano Zampini PetscErrorCode ierr; 17987e8381f9SStefano Zampini 17997e8381f9SStefano Zampini PetscFunctionBegin; 18007e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 18017e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 18027e8381f9SStefano Zampini 18037e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 18047e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 18057e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 18067e8381f9SStefano Zampini ierr = 
PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 18077e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 18087e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 18097e8381f9SStefano Zampini } 18107e8381f9SStefano Zampini PetscFunctionReturn(0); 18117e8381f9SStefano Zampini } 18127e8381f9SStefano Zampini 18137e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 18147e8381f9SStefano Zampini { 18157e8381f9SStefano Zampini PetscErrorCode ierr; 18167e8381f9SStefano Zampini 18177e8381f9SStefano Zampini PetscFunctionBegin; 18187e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 181967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 182067a45760SJunchao Zhang PetscFunctionReturn(0); 182167a45760SJunchao Zhang } 182267a45760SJunchao Zhang 182367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 182467a45760SJunchao Zhang { 182567a45760SJunchao Zhang PetscFunctionBegin; 18267e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 182767a45760SJunchao Zhang *array = NULL; 182867a45760SJunchao Zhang PetscFunctionReturn(0); 182967a45760SJunchao Zhang } 183067a45760SJunchao Zhang 183167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 183267a45760SJunchao Zhang { 183367a45760SJunchao Zhang PetscErrorCode ierr; 183467a45760SJunchao Zhang 183567a45760SJunchao Zhang PetscFunctionBegin; 183667a45760SJunchao Zhang ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 183767a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 183867a45760SJunchao Zhang PetscFunctionReturn(0); 183967a45760SJunchao Zhang } 184067a45760SJunchao Zhang 184167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 184267a45760SJunchao Zhang { 184367a45760SJunchao Zhang 
PetscFunctionBegin; 184467a45760SJunchao Zhang *array = NULL; 184567a45760SJunchao Zhang PetscFunctionReturn(0); 184667a45760SJunchao Zhang } 184767a45760SJunchao Zhang 184867a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 184967a45760SJunchao Zhang { 185067a45760SJunchao Zhang PetscFunctionBegin; 185167a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 185267a45760SJunchao Zhang PetscFunctionReturn(0); 185367a45760SJunchao Zhang } 185467a45760SJunchao Zhang 185567a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 185667a45760SJunchao Zhang { 185767a45760SJunchao Zhang PetscFunctionBegin; 185867a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 185967a45760SJunchao Zhang *array = NULL; 18607e8381f9SStefano Zampini PetscFunctionReturn(0); 18617e8381f9SStefano Zampini } 18627e8381f9SStefano Zampini 1863042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 18649ae82921SPaul Mullowney { 1865aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 18667c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 18679ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1868213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 18699ae82921SPaul Mullowney PetscErrorCode ierr; 1870aa372e3fSPaul Mullowney cusparseStatus_t stat; 1871abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1872b06137fdSPaul Mullowney cudaError_t err; 18739ae82921SPaul Mullowney 18749ae82921SPaul Mullowney PetscFunctionBegin; 1875e8d2b73aSMark Adams if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1876c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1877a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == 
MAT_CUSPARSE_CSR) { /* Copy values only */ 1878a49f1ed0SStefano Zampini CsrMatrix *matrix; 1879afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 188085ba7357SStefano Zampini 1881e8d2b73aSMark Adams if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 188285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1883afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 188405035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 18854863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 188685ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1887a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 188834d6c7a5SJose E. Roman } else { 1889abb89eb1SStefano Zampini PetscInt nnz; 189085ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 18917c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1892a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 18937c700b8dSJunchao Zhang delete cusparsestruct->workVector; 189481902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1895a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1896a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18979ae82921SPaul Mullowney try { 18989ae82921SPaul Mullowney if (a->compressedrow.use) { 18999ae82921SPaul Mullowney m = a->compressedrow.nrows; 19009ae82921SPaul Mullowney ii = a->compressedrow.i; 19019ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 19029ae82921SPaul Mullowney } else { 1903213423ffSJunchao Zhang m = A->rmap->n; 1904213423ffSJunchao Zhang ii = a->i; 1905e6e9a74fSStefano Zampini ridx = NULL; 19069ae82921SPaul Mullowney } 1907e8d2b73aSMark Adams if (!ii) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1908e8d2b73aSMark Adams if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1909abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1910abb89eb1SStefano Zampini else nnz = a->nz; 19119ae82921SPaul Mullowney 191285ba7357SStefano Zampini /* create cusparse matrix */ 1913abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1914aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 191557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 191657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 191757d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 19189ae82921SPaul Mullowney 1919afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 19207656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 19217656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1922afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 19237656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 19247656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 192557d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1926b06137fdSPaul Mullowney 1927aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1928aa372e3fSPaul Mullowney if 
(cusparsestruct->format==MAT_CUSPARSE_CSR) { 1929aa372e3fSPaul Mullowney /* set the matrix */ 1930afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1931afb2bd1cSJunchao Zhang mat->num_rows = m; 1932afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1933abb89eb1SStefano Zampini mat->num_entries = nnz; 1934afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1935afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 19369ae82921SPaul Mullowney 1937abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1938abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1939aa372e3fSPaul Mullowney 1940abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1941abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1942aa372e3fSPaul Mullowney 1943aa372e3fSPaul Mullowney /* assign the pointer */ 1944afb2bd1cSJunchao Zhang matstruct->mat = mat; 1945afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1946afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1947afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1948afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1949afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1950afb2bd1cSJunchao Zhang mat->values->data().get(), 1951afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1952afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1953afb2bd1cSJunchao Zhang } 1954afb2bd1cSJunchao Zhang #endif 1955aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1956afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1957afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1958afb2bd1cSJunchao Zhang #else 1959afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1960afb2bd1cSJunchao Zhang mat->num_rows = m; 1961afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1962abb89eb1SStefano Zampini mat->num_entries = nnz; 1963afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1964afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1965aa372e3fSPaul Mullowney 1966abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1967abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1968aa372e3fSPaul Mullowney 1969abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1970abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1971aa372e3fSPaul Mullowney 1972aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 197357d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1974aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1975aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1976afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1977afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1978afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1979afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 198057d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1981aa372e3fSPaul Mullowney /* assign the pointer */ 1982aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1983aa372e3fSPaul Mullowney 1984afb2bd1cSJunchao Zhang if (mat) { 1985afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1986afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1987afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1988afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1989087f3262SPaul Mullowney } 1990afb2bd1cSJunchao Zhang #endif 1991087f3262SPaul Mullowney } 1992ca45077fSPaul Mullowney 1993aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1994213423ffSJunchao Zhang if (a->compressedrow.use) { 1995213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1996aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1997aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1998213423ffSJunchao Zhang tmp = m; 1999213423ffSJunchao Zhang } else { 2000213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2001213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2002213423ffSJunchao Zhang tmp = 0; 2003213423ffSJunchao Zhang } 2004213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2005aa372e3fSPaul Mullowney 2006aa372e3fSPaul Mullowney /* assign the pointer */ 2007aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 20089ae82921SPaul Mullowney } 
/*
   The flattened line below first finishes the preceding routine (catch handler that turns a
   thrown char* into a PETSC_ERR_LIB error, WaitForCUDA(), end of the MAT_CUSPARSECopyToGPU
   log event, nonzerostate bookkeeping, offload mask set to PETSC_OFFLOAD_BOTH when `both`
   is true) and then defines, in order:

   VecCUDAPlusEquals    - functor applied to a tuple t: element 1 of t is replaced by
                          (element 1 + element 0), i.e. get<1>(t) += get<0>(t)
   VecCUDAEquals        - functor applied to a tuple t: element 0 is copied into element 1
   VecCUDAEqualsReverse - functor applied to a tuple t: element 1 is copied into element 0

   All three are templated on the tuple type and declared __host__ __device__.
   The line ends with the opening brace of struct MatMatCusparse.
*/
catch(char *ex) { 20099ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 20109ae82921SPaul Mullowney } 201105035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 201285ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 201334d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 201434d6c7a5SJose E. Roman } 2015abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 20169ae82921SPaul Mullowney } 20179ae82921SPaul Mullowney PetscFunctionReturn(0); 20189ae82921SPaul Mullowney } 20199ae82921SPaul Mullowney 2020c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 2021aa372e3fSPaul Mullowney { 2022aa372e3fSPaul Mullowney template <typename Tuple> 2023aa372e3fSPaul Mullowney __host__ __device__ 2024aa372e3fSPaul Mullowney void operator()(Tuple t) 2025aa372e3fSPaul Mullowney { 2026aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2027aa372e3fSPaul Mullowney } 2028aa372e3fSPaul Mullowney }; 2029aa372e3fSPaul Mullowney 20307e8381f9SStefano Zampini struct VecCUDAEquals 20317e8381f9SStefano Zampini { 20327e8381f9SStefano Zampini template <typename Tuple> 20337e8381f9SStefano Zampini __host__ __device__ 20347e8381f9SStefano Zampini void operator()(Tuple t) 20357e8381f9SStefano Zampini { 20367e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 20377e8381f9SStefano Zampini } 20387e8381f9SStefano Zampini }; 20397e8381f9SStefano Zampini 2040e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 2041e6e9a74fSStefano Zampini { 2042e6e9a74fSStefano Zampini template <typename Tuple> 2043e6e9a74fSStefano Zampini __host__ __device__ 2044e6e9a74fSStefano Zampini void operator()(Tuple t) 2045e6e9a74fSStefano Zampini { 2046e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2047e6e9a74fSStefano Zampini } 2048e6e9a74fSStefano Zampini }; 2049e6e9a74fSStefano Zampini 2050afb2bd1cSJunchao Zhang struct MatMatCusparse { 
/*
   Fields of MatMatCusparse (per-product data stored in C->product->data), as used by the
   routines visible in this file section:

   cisdense     - PETSC_TRUE when C was of type MATSEQDENSE at symbolic time; the numeric
                  phase then converts C back to MATSEQDENSE after computing on the GPU
   Bt           - device buffer holding B^T; allocated only for CUDA < 11 and product types
                  ABt/RARt, filled with cublasXgeam in the numeric phase
   X            - intermediate dense matrix used for the RARt and PtAP products
   reusesym     - set when the values were already computed in the symbolic phase
   flops, Bcsr  - NOTE(review): not set in the visible code; Bcsr appears to be an alternative
                  CSR view of B for compressed-row storage - confirm against the sparse-sparse
                  symbolic routine
   CUDA >= 11 only:
   matSpBDescr            - sparse descriptor paired with Bcsr
   initialized            - PETSC_TRUE once the dense descriptors and SpMM buffer are set up
   matBDescr, matCDescr   - dense descriptors for B and for C (or X)
   Blda, Clda             - leading dimensions recorded when the descriptors were created,
                            compared on later calls to detect changes
   dBuffer4, dBuffer5     - extra device buffers (CUDA >= 11.4), released in the destroy routine
   mmBufferSize, mmBuffer - size of, and pointer to, the SpMM/SpGEMM work buffer
   mmBuffer2              - SpGEMM work-estimation buffer
   spgemmDesc             - SpGEMM descriptor

   The same flattened lines also contain:

   MatDestroy_MatMatCusparse(void *data) - destructor installed as C->product->destroy.
     It cudaFree()s Bt, deletes the Bcsr struct, destroys every non-NULL cuSPARSE descriptor,
     cudaFree()s the non-NULL work buffers, destroys X and finally PetscFree()s the struct.

   The prototype of MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(), and the opening
   declarations of MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA().
*/
2051ccdfe979SStefano Zampini PetscBool cisdense; 2052ccdfe979SStefano Zampini PetscScalar *Bt; 2053ccdfe979SStefano Zampini Mat X; 2054fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2055fcdce8c4SStefano Zampini PetscLogDouble flops; 2056fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2057b4285af6SJunchao Zhang 2058afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2059fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2060afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2061afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2062afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2063afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2064b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2065b4285af6SJunchao Zhang void *dBuffer4; 2066b4285af6SJunchao Zhang void *dBuffer5; 2067b4285af6SJunchao Zhang #endif 2068fcdce8c4SStefano Zampini size_t mmBufferSize; 2069fcdce8c4SStefano Zampini void *mmBuffer; 2070fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2071fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2072afb2bd1cSJunchao Zhang #endif 2073afb2bd1cSJunchao Zhang }; 2074ccdfe979SStefano Zampini 2075ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2076ccdfe979SStefano Zampini { 2077ccdfe979SStefano Zampini PetscErrorCode ierr; 2078ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2079ccdfe979SStefano Zampini cudaError_t cerr; 2080fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2081fcdce8c4SStefano Zampini cusparseStatus_t stat; 2082fcdce8c4SStefano Zampini #endif 2083ccdfe979SStefano Zampini 2084ccdfe979SStefano Zampini PetscFunctionBegin; 2085ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2086fcdce8c4SStefano Zampini delete 
mmdata->Bcsr; 2087afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2088fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 2089afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2090afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2091fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2092b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2093b4285af6SJunchao Zhang if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2094b4285af6SJunchao Zhang if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2095b4285af6SJunchao Zhang #endif 2096b4285af6SJunchao Zhang if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2097b4285af6SJunchao Zhang if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2098afb2bd1cSJunchao Zhang #endif 2099ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2100ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 2101ccdfe979SStefano Zampini PetscFunctionReturn(0); 2102ccdfe979SStefano Zampini } 2103ccdfe979SStefano Zampini 2104ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2105ccdfe979SStefano Zampini 2106ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2107ccdfe979SStefano Zampini { 2108ccdfe979SStefano Zampini Mat_Product *product = C->product; 2109ccdfe979SStefano Zampini Mat A,B; 2110afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2111ccdfe979SStefano Zampini PetscBool flg,biscuda; 2112ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2113ccdfe979SStefano Zampini 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a product with sparse A
   (MATSEQAIJCUSPARSE) and dense B, for product types AB, AtB, ABt, PtAP and RARt.

   Input/Output: C - the product matrix; C->product->data must already hold the
                     MatMatCusparse created by the symbolic phase.

   Outline of the flattened code that follows:
   1. Validate: product data present, A is MATSEQAIJCUSPARSE, A not bound to the CPU;
      then copy A to the GPU.
   2. From the product type select the mult struct to use (cusp->mat or cusp->matTranspose),
      the cuSPARSE operation applied to A, and the result sizes m x n.
   3. If B is not MATSEQDENSECUDA it is converted in place (and converted back to MATSEQDENSE
      before returning). The sparse-dense result is written to mmdata->X for RARt/PtAP and
      to C otherwise.
   4. CUDA >= 11: cusparseSpMM with cached descriptors/work buffer.
      CUDA <  11: cusparse_csr_spmm, with B explicitly transposed into mmdata->Bt for ABt/RARt.
   5. Log flops (n*2.0*num_entries), restore arrays; for RARt/PtAP finish with
      MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,X,C,...) (the helper's flag
      semantics are not visible here); convert C to MATSEQDENSE when mmdata->cisdense.
*/
cusparseStatus_t stat; 2114ccdfe979SStefano Zampini cusparseOperation_t opA; 2115ccdfe979SStefano Zampini const PetscScalar *barray; 2116ccdfe979SStefano Zampini PetscScalar *carray; 2117ccdfe979SStefano Zampini PetscErrorCode ierr; 2118ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2119ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2120ccdfe979SStefano Zampini CsrMatrix *csrmat; 2121ccdfe979SStefano Zampini 2122ccdfe979SStefano Zampini PetscFunctionBegin; 2123ccdfe979SStefano Zampini MatCheckProduct(C,1); 2124e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2125ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2126ccdfe979SStefano Zampini A = product->A; 2127ccdfe979SStefano Zampini B = product->B; 2128ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2129e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2130ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2131ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 2132ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2133ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2134ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2135ccdfe979SStefano Zampini switch (product->type) { 2136ccdfe979SStefano Zampini case MATPRODUCT_AB: 2137ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2138ccdfe979SStefano Zampini mat = cusp->mat; 2139ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2140ccdfe979SStefano Zampini m = A->rmap->n; 2141ccdfe979SStefano Zampini n = B->cmap->n; 2142ccdfe979SStefano 
/* AtB: when A->form_explicit_transpose is not set, the stored matrix is used with
   CUSPARSE_OPERATION_TRANSPOSE; otherwise the explicit transpose is formed and used untransposed.
   After the switch, the device array and leading dimension of B are obtained. */
Zampini break; 2143ccdfe979SStefano Zampini case MATPRODUCT_AtB: 21441a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2145e6e9a74fSStefano Zampini mat = cusp->mat; 2146e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2147e6e9a74fSStefano Zampini } else { 21483606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2149ccdfe979SStefano Zampini mat = cusp->matTranspose; 2150ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2151e6e9a74fSStefano Zampini } 2152ccdfe979SStefano Zampini m = A->cmap->n; 2153ccdfe979SStefano Zampini n = B->cmap->n; 2154ccdfe979SStefano Zampini break; 2155ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2156ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2157ccdfe979SStefano Zampini mat = cusp->mat; 2158ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2159ccdfe979SStefano Zampini m = A->rmap->n; 2160ccdfe979SStefano Zampini n = B->rmap->n; 2161ccdfe979SStefano Zampini break; 2162ccdfe979SStefano Zampini default: 2163e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2164ccdfe979SStefano Zampini } 2165e8d2b73aSMark Adams if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2166ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2167ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2168ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2169afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2170ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2171afb2bd1cSJunchao Zhang 2172ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2173c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == 
MATPRODUCT_PtAP) { 2174c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2175c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2176c8378d12SStefano Zampini } else { 2177c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2178c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2179c8378d12SStefano Zampini } 2180c8378d12SStefano Zampini 2181c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2182afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2183afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2184a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2185afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2186fcdce8c4SStefano Zampini size_t mmBufferSize; 2187afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2188afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2189afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2190afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2191afb2bd1cSJunchao Zhang } 2192c8378d12SStefano Zampini 2193afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2194afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2195afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2196afb2bd1cSJunchao Zhang 
/* Setup path, continued: the sparse descriptor of A is created only if it does not exist yet,
   the SpMM workspace size is queried, and mmBuffer is (re)allocated only when it is missing or
   smaller than required. In the else branch (already initialized, same LDAs) only the value
   pointers of the three descriptors are refreshed. */
mmdata->Clda = clda; 2197afb2bd1cSJunchao Zhang } 2198afb2bd1cSJunchao Zhang 2199afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2200afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2201afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2202afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2203afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2204afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2205afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2206afb2bd1cSJunchao Zhang } 2207afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2208afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2209afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2210fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2211fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2212ee7b52eaSHong Zhang cudaError_t cerr; 2213fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2214fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2215fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2216fcdce8c4SStefano Zampini } 2217afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2218afb2bd1cSJunchao Zhang } else { 2219afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2220afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2221afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2222afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2223afb2bd1cSJunchao Zhang } 
/* The product itself. CUDA >= 11 calls cusparseSpMM with the alpha_one/beta_zero scalars of the
   mult struct. The #else branch (older CUDA) first writes B^T into mmdata->Bt with cublasXgeam
   for ABt/RARt (so blda and k become B->cmap->n) and then calls cusparse_csr_spmm, reading
   mmdata->Bt when it is non-NULL and barray otherwise. */
2224afb2bd1cSJunchao Zhang 2225afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2226afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2227afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2228afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2229fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2230afb2bd1cSJunchao Zhang #else 2231afb2bd1cSJunchao Zhang PetscInt k; 2232afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2233ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2234ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2235ccdfe979SStefano Zampini cublasStatus_t cerr; 2236ccdfe979SStefano Zampini 2237ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2238ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2239ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2240ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2241ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2242ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2243ccdfe979SStefano Zampini blda = B->cmap->n; 2244afb2bd1cSJunchao Zhang k = B->cmap->n; 2245afb2bd1cSJunchao Zhang } else { 2246afb2bd1cSJunchao Zhang k = B->rmap->n; 2247ccdfe979SStefano Zampini } 2248ccdfe979SStefano Zampini 2249afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2250ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2251afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2252ccdfe979SStefano Zampini csrmat->values->data().get(), 2253ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2254ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2255ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2256ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2257afb2bd1cSJunchao Zhang #endif 2258c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2259c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2260ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2261ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2262ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2263ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2264ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2265ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2266ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2267ccdfe979SStefano Zampini } else { 2268ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2269ccdfe979SStefano Zampini } 2270ccdfe979SStefano Zampini if (mmdata->cisdense) { 2271ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2272ccdfe979SStefano Zampini } 2273ccdfe979SStefano Zampini if (!biscuda) { 2274ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2275ccdfe979SStefano Zampini } 2276ccdfe979SStefano Zampini PetscFunctionReturn(0); 2277ccdfe979SStefano Zampini } 2278ccdfe979SStefano Zampini 2279ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2280ccdfe979SStefano Zampini { 2281ccdfe979SStefano Zampini Mat_Product *product = C->product; 2282ccdfe979SStefano Zampini Mat A,B; 2283ccdfe979SStefano Zampini PetscInt m,n; 2284ccdfe979SStefano Zampini PetscBool cisdense,flg; 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA (body continues below) - symbolic phase of a
   product with sparse A (MATSEQAIJCUSPARSE, CSR format only) and dense B.

   Input/Output: C - the product matrix; C->product->data must be empty on entry.

   What the code does:
   - rejects A that is not MATSEQAIJCUSPARSE or not stored as MAT_CUSPARSE_CSR;
   - picks the result sizes from the product type:
       AB   : A->rmap->n x B->cmap->n      AtB  : A->cmap->n x B->cmap->n
       ABt  : A->rmap->n x B->rmap->n      PtAP : B->cmap->n x B->cmap->n
       RARt : B->rmap->n x B->rmap->n
   - records whether C was MATSEQDENSE (cisdense) before forcing its type to MATSEQDENSECUDA;
   - allocates the MatMatCusparse product data; for CUDA < 11 and ABt/RARt also the device
     buffer Bt of B->rmap->n * B->cmap->n scalars that receives B^T;
   - for RARt/PtAP creates the intermediate MATSEQDENSECUDA matrix X
     (A->rmap->n x B->rmap->n for RARt, A->rmap->n x B->cmap->n for PtAP);
   - installs the product data, MatDestroy_MatMatCusparse as its destructor and
     MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as C->ops->productnumeric.
*/
2285ccdfe979SStefano Zampini PetscErrorCode ierr; 2286ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2287ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2288ccdfe979SStefano Zampini 2289ccdfe979SStefano Zampini PetscFunctionBegin; 2290ccdfe979SStefano Zampini MatCheckProduct(C,1); 2291e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2292ccdfe979SStefano Zampini A = product->A; 2293ccdfe979SStefano Zampini B = product->B; 2294ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2295e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2296ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2297e8d2b73aSMark Adams if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2298ccdfe979SStefano Zampini switch (product->type) { 2299ccdfe979SStefano Zampini case MATPRODUCT_AB: 2300ccdfe979SStefano Zampini m = A->rmap->n; 2301ccdfe979SStefano Zampini n = B->cmap->n; 2302ccdfe979SStefano Zampini break; 2303ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2304ccdfe979SStefano Zampini m = A->cmap->n; 2305ccdfe979SStefano Zampini n = B->cmap->n; 2306ccdfe979SStefano Zampini break; 2307ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2308ccdfe979SStefano Zampini m = A->rmap->n; 2309ccdfe979SStefano Zampini n = B->rmap->n; 2310ccdfe979SStefano Zampini break; 2311ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2312ccdfe979SStefano Zampini m = B->cmap->n; 2313ccdfe979SStefano Zampini n = B->cmap->n; 2314ccdfe979SStefano Zampini break; 2315ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2316ccdfe979SStefano Zampini m = B->rmap->n; 2317ccdfe979SStefano Zampini n = B->rmap->n; 2318ccdfe979SStefano Zampini break; 2319ccdfe979SStefano Zampini default: 2320e8d2b73aSMark Adams 
/* Any other product type is an error. After the switch: size and retype C, create the product
   data, and allocate the auxiliary storage (Bt for CUDA < 11, X for RARt/PtAP). */
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2321ccdfe979SStefano Zampini } 2322ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2323ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2324ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2325ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2326ccdfe979SStefano Zampini 2327ccdfe979SStefano Zampini /* product data */ 2328ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2329ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2330afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2331afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2332ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2333afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2334ccdfe979SStefano Zampini } 2335afb2bd1cSJunchao Zhang #endif 2336ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2337ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2338ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2339ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2340ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2341ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2342ccdfe979SStefano Zampini } else { 2343ccdfe979SStefano Zampini ierr 
= MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2344ccdfe979SStefano Zampini } 2345ccdfe979SStefano Zampini } 2346ccdfe979SStefano Zampini C->product->data = mmdata; 2347ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2348ccdfe979SStefano Zampini 2349ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2350ccdfe979SStefano Zampini PetscFunctionReturn(0); 2351ccdfe979SStefano Zampini } 2352ccdfe979SStefano Zampini 2353fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2354ccdfe979SStefano Zampini { 2355ccdfe979SStefano Zampini Mat_Product *product = C->product; 2356fcdce8c4SStefano Zampini Mat A,B; 2357fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2358fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2359fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2360fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2361fcdce8c4SStefano Zampini PetscBool flg; 2362ccdfe979SStefano Zampini PetscErrorCode ierr; 2363fcdce8c4SStefano Zampini cusparseStatus_t stat; 2364fcdce8c4SStefano Zampini cudaError_t cerr; 2365fcdce8c4SStefano Zampini MatProductType ptype; 2366fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2367fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2368fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2369fcdce8c4SStefano Zampini #endif 2370b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2371ccdfe979SStefano Zampini 2372ccdfe979SStefano Zampini PetscFunctionBegin; 2373ccdfe979SStefano Zampini MatCheckProduct(C,1); 2374e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2375fcdce8c4SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2376e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2377fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2378fcdce8c4SStefano Zampini A = product->A; 2379fcdce8c4SStefano Zampini B = product->B; 2380fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2381fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2382fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2383e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2384fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2385e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2386fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2387e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2388fcdce8c4SStefano Zampini goto finalize; 2389fcdce8c4SStefano Zampini } 2390fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2391fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2392e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2393fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2394e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2395fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind 
to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2396fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2397fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2398fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2399fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2400e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2401e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2402e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2403fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2404fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2405fcdce8c4SStefano Zampini 2406fcdce8c4SStefano Zampini ptype = product->type; 2407fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2408fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2409fa046f9fSJunchao Zhang if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2410fa046f9fSJunchao Zhang } 2411fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2412fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2413fa046f9fSJunchao Zhang if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2414fa046f9fSJunchao Zhang } 2415fcdce8c4SStefano Zampini switch (ptype) { 2416fcdce8c4SStefano Zampini case 
MATPRODUCT_AB: 2417fcdce8c4SStefano Zampini Amat = Acusp->mat; 2418fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2419fcdce8c4SStefano Zampini break; 2420fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2421fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2422fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2423fcdce8c4SStefano Zampini break; 2424fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2425fcdce8c4SStefano Zampini Amat = Acusp->mat; 2426fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2427fcdce8c4SStefano Zampini break; 2428fcdce8c4SStefano Zampini default: 2429e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2430fcdce8c4SStefano Zampini } 2431fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2432e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2433e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2434e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2435fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2436fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2437fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2438e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2439e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2440e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2441fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2442fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2443fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2444b4285af6SJunchao Zhang stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2445b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2446b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2447b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2448b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2449b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2450b4285af6SJunchao Zhang #else 2451b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2452fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2453fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2454fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2455b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2456fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2457fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 
2458b4285af6SJunchao Zhang #endif 2459fcdce8c4SStefano Zampini #else 2460b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2461fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2462fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2463fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2464fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2465fcdce8c4SStefano Zampini #endif 2466fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2467fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2468fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2469fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2470fcdce8c4SStefano Zampini finalize: 2471fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 2472fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2473fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2474fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2475fcdce8c4SStefano Zampini c->reallocs = 0; 2476fcdce8c4SStefano Zampini C->info.mallocs += 0; 2477fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2478fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2479fcdce8c4SStefano Zampini C->num_ass++; 2480ccdfe979SStefano Zampini PetscFunctionReturn(0); 2481ccdfe979SStefano Zampini } 2482fcdce8c4SStefano Zampini 2483fcdce8c4SStefano Zampini static PetscErrorCode 
MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2484fcdce8c4SStefano Zampini { 2485fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2486fcdce8c4SStefano Zampini Mat A,B; 2487fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2488fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2489fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2490fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2491fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2492fcdce8c4SStefano Zampini PetscBool flg; 2493fcdce8c4SStefano Zampini PetscErrorCode ierr; 2494fcdce8c4SStefano Zampini cusparseStatus_t stat; 2495fcdce8c4SStefano Zampini cudaError_t cerr; 2496fcdce8c4SStefano Zampini MatProductType ptype; 2497fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2498fcdce8c4SStefano Zampini PetscLogDouble flops; 2499fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2500fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2501fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2502fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2503fcdce8c4SStefano Zampini #else 2504fcdce8c4SStefano Zampini int cnz; 2505fcdce8c4SStefano Zampini #endif 2506b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2507fcdce8c4SStefano Zampini 2508fcdce8c4SStefano Zampini PetscFunctionBegin; 2509fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2510e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2511fcdce8c4SStefano Zampini A = product->A; 2512fcdce8c4SStefano Zampini B = product->B; 2513fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2514e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 
2515fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2516e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2517fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2518fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2519fcdce8c4SStefano Zampini /* product data */ 2520fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2521fcdce8c4SStefano Zampini C->product->data = mmdata; 2522fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2523fcdce8c4SStefano Zampini 2524fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2525fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2526d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2527d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2528d60bce21SJunchao Zhang if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2529d60bce21SJunchao Zhang if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2530d60bce21SJunchao Zhang 2531fcdce8c4SStefano Zampini ptype = product->type; 2532fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2533fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2534fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2535fa046f9fSJunchao Zhang } 2536fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2537fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2538fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2539fa046f9fSJunchao Zhang } 2540fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2541fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 
2542fcdce8c4SStefano Zampini switch (ptype) { 2543fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2544fcdce8c4SStefano Zampini m = A->rmap->n; 2545fcdce8c4SStefano Zampini n = B->cmap->n; 2546fcdce8c4SStefano Zampini k = A->cmap->n; 2547fcdce8c4SStefano Zampini Amat = Acusp->mat; 2548fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2549fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2550fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2551fcdce8c4SStefano Zampini break; 2552fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2553fcdce8c4SStefano Zampini m = A->cmap->n; 2554fcdce8c4SStefano Zampini n = B->cmap->n; 2555fcdce8c4SStefano Zampini k = A->rmap->n; 25563606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2557fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2558fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2559fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2560fcdce8c4SStefano Zampini break; 2561fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2562fcdce8c4SStefano Zampini m = A->rmap->n; 2563fcdce8c4SStefano Zampini n = B->rmap->n; 2564fcdce8c4SStefano Zampini k = A->cmap->n; 25653606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2566fcdce8c4SStefano Zampini Amat = Acusp->mat; 2567fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2568fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2569fcdce8c4SStefano Zampini break; 2570fcdce8c4SStefano Zampini default: 2571e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2572fcdce8c4SStefano Zampini } 2573fcdce8c4SStefano Zampini 2574fcdce8c4SStefano Zampini /* create cusparse matrix */ 2575fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2576fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 
2577fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2578fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2579fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2580fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2581fcdce8c4SStefano Zampini 2582fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2583fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2584fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2585fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2586fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2587fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2588fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2589fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2590fcdce8c4SStefano Zampini } else { 2591fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2592fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2593fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2594fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2595fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2596fcdce8c4SStefano Zampini } 2597fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2598fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2599fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2600fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2601fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2602fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2603fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2604fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2605fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2606fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2607fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2608fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2609fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2610fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2611fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2612fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2613fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2614fcdce8c4SStefano Zampini c->nz = 0; 2615fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2616fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2617fcdce8c4SStefano Zampini goto finalizesym; 2618fcdce8c4SStefano Zampini } 2619fcdce8c4SStefano Zampini 2620e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2621e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2622fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2623fcdce8c4SStefano Zampini if (!biscompressed) { 2624fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2625fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2626fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2627fcdce8c4SStefano Zampini #endif 2628fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2629fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2630fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2631fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2632fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2633fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2634fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2635fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2636fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2637fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2638fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2639fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2640fcdce8c4SStefano Zampini } 2641fcdce8c4SStefano Zampini 
Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2642fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2643fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2644fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2645fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2646fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2647fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2648fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2649fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2650fcdce8c4SStefano Zampini } 2651fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2652fcdce8c4SStefano Zampini #endif 2653fcdce8c4SStefano Zampini } 2654e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2655e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2656fcdce8c4SStefano Zampini /* precompute flops count */ 2657fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2658fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2659fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2660fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2661fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2662fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2663fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2664fcdce8c4SStefano Zampini } 2665fcdce8c4SStefano Zampini } 2666fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2667fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2668fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2669fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2670fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 2671fcdce8c4SStefano Zampini } 
2672fcdce8c4SStefano Zampini } else { /* TODO */ 2673fcdce8c4SStefano Zampini flops = 0.; 2674fcdce8c4SStefano Zampini } 2675fcdce8c4SStefano Zampini 2676fcdce8c4SStefano Zampini mmdata->flops = flops; 2677fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2678b4285af6SJunchao Zhang 2679fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2680fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2681fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2682fcdce8c4SStefano Zampini NULL, NULL, NULL, 2683fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2684fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2685fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2686b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2687b4285af6SJunchao Zhang { 2688b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2689b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2690b4285af6SJunchao Zhang */ 2691b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2692b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2693b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2694b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2695b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2696b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2697b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2698b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2699b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2700b4285af6SJunchao Zhang 2701b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2702b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2703b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2704b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2705b4285af6SJunchao Zhang &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2706b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2707b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2708b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2709b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2710b4285af6SJunchao Zhang &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2711b4285af6SJunchao Zhang 2712b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2713b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2714b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, 2715b4285af6SJunchao Zhang &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2716b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2717b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2718b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2719b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2720b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2721b4285af6SJunchao Zhang &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2722b4285af6SJunchao Zhang cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2723b4285af6SJunchao Zhang cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2724b4285af6SJunchao Zhang 2725b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2726b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 2727b4285af6SJunchao Zhang stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2728b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2729b4285af6SJunchao Zhang /* allocate matrix C */ 2730b4285af6SJunchao Zhang Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2731b4285af6SJunchao Zhang Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2732b4285af6SJunchao Zhang /* update matC with the new pointers */ 2733b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2734b4285af6SJunchao Zhang Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2735b4285af6SJunchao Zhang 2736b4285af6SJunchao Zhang 
/*----------------------------------------------------------------------*/ 2737b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2738b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2739b4285af6SJunchao Zhang &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2740b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2741b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2742b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2743b4285af6SJunchao Zhang &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2744b4285af6SJunchao Zhang cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2745b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2746b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2747b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2748b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2749b4285af6SJunchao Zhang ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2750b4285af6SJunchao Zhang } 2751b4285af6SJunchao Zhang #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2752b4285af6SJunchao Zhang size_t bufSize2; 2753fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2754b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2755fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2756fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2757fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2758bfcc3627SStefano Zampini cerr = 
cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2759fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2760b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2761fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2762fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2763fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2764fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2765b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2766fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2767fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2768fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2769fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2770fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2771fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2772fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2773fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2774bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2775fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2776b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2777fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2778fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2779fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2780fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2781fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2782fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 278300702c57SStefano Zampini ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2784fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2785fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2786fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2787fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2788fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2789fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2790b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2791fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2792fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2793b4285af6SJunchao Zhang #endif 
2794fcdce8c4SStefano Zampini #else 2795fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2796b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2797fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2798fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2799fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2800fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2801fcdce8c4SStefano Zampini c->nz = cnz; 2802fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2803fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2804fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2805fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2806fcdce8c4SStefano Zampini 2807fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2808fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2809fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2810fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2811b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2812fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2813fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2814fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2815fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2816fcdce8c4SStefano Zampini #endif 2817fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2818fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2819fcdce8c4SStefano Zampini finalizesym: 2820fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2821fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2822fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2823fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2824fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2825fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2826fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2827fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2828fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2829fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2830fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2831fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2832fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2833fcdce8c4SStefano Zampini cerr = 
cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2834fcdce8c4SStefano Zampini } else { 2835fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2836fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2837fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2838fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2839fcdce8c4SStefano Zampini } 2840fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2841fcdce8c4SStefano Zampini PetscInt r = 0; 2842fcdce8c4SStefano Zampini c->i[0] = 0; 2843fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2844fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2845fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2846fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2847fcdce8c4SStefano Zampini } 2848fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2849fcdce8c4SStefano Zampini } 2850fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2851fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2852fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2853fcdce8c4SStefano Zampini c->maxnz = c->nz; 2854fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2855fcdce8c4SStefano Zampini c->rmax = 0; 2856fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2857fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2858fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2859fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 
2860fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2861fcdce8c4SStefano Zampini } 2862fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2863fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2864fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2865fcdce8c4SStefano Zampini 2866fcdce8c4SStefano Zampini C->nonzerostate++; 2867fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2868fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2869fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2870fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2871fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2872fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2873fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2874abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2875fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2876fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2877fcdce8c4SStefano Zampini } 2878fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2879fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2880fcdce8c4SStefano Zampini } 2881fcdce8c4SStefano Zampini 2882fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2883fcdce8c4SStefano Zampini 2884fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2885fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2886fcdce8c4SStefano Zampini { 2887fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2888fcdce8c4SStefano Zampini PetscErrorCode ierr; 2889fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 
2890fcdce8c4SStefano Zampini 2891fcdce8c4SStefano Zampini PetscFunctionBegin; 2892fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2893fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2894abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 2895fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2896fcdce8c4SStefano Zampini } 2897fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2898fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2899fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2900fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2901fcdce8c4SStefano Zampini } 2902fcdce8c4SStefano Zampini } 290365e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 290465e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 290565e4b4d4SStefano Zampini switch (product->type) { 290665e4b4d4SStefano Zampini case MATPRODUCT_AB: 290765e4b4d4SStefano Zampini if (product->api_user) { 290865e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 290965e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 291065e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 291165e4b4d4SStefano Zampini } else { 291265e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 291365e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 291465e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 291565e4b4d4SStefano Zampini } 
291665e4b4d4SStefano Zampini break; 291765e4b4d4SStefano Zampini case MATPRODUCT_AtB: 291865e4b4d4SStefano Zampini if (product->api_user) { 291965e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 292065e4b4d4SStefano Zampini ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 292165e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 292265e4b4d4SStefano Zampini } else { 292365e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 292465e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 292565e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 292665e4b4d4SStefano Zampini } 292765e4b4d4SStefano Zampini break; 292865e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 292965e4b4d4SStefano Zampini if (product->api_user) { 293065e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 293165e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 293265e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 293365e4b4d4SStefano Zampini } else { 293465e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 293565e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 293665e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 293765e4b4d4SStefano Zampini } 293865e4b4d4SStefano Zampini break; 293965e4b4d4SStefano Zampini case MATPRODUCT_RARt: 
294065e4b4d4SStefano Zampini if (product->api_user) { 294165e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 294265e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 294365e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 294465e4b4d4SStefano Zampini } else { 294565e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 294665e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 294765e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 294865e4b4d4SStefano Zampini } 294965e4b4d4SStefano Zampini break; 295065e4b4d4SStefano Zampini case MATPRODUCT_ABC: 295165e4b4d4SStefano Zampini if (product->api_user) { 295265e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 295365e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 295465e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 295565e4b4d4SStefano Zampini } else { 295665e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 295765e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 295865e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 295965e4b4d4SStefano Zampini } 296065e4b4d4SStefano Zampini break; 296165e4b4d4SStefano Zampini default: 296265e4b4d4SStefano Zampini break; 296365e4b4d4SStefano Zampini } 296465e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = 
PETSC_FALSE; 296565e4b4d4SStefano Zampini } 296665e4b4d4SStefano Zampini /* dispatch */ 2967fcdce8c4SStefano Zampini if (isdense) { 2968ccdfe979SStefano Zampini switch (product->type) { 2969ccdfe979SStefano Zampini case MATPRODUCT_AB: 2970ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2971ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2972ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2973ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2974fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2975fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2976fcdce8c4SStefano Zampini } else { 2977fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2978fcdce8c4SStefano Zampini } 2979fcdce8c4SStefano Zampini break; 2980fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2981fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2982fcdce8c4SStefano Zampini break; 2983ccdfe979SStefano Zampini default: 2984ccdfe979SStefano Zampini break; 2985ccdfe979SStefano Zampini } 2986fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2987fcdce8c4SStefano Zampini switch (product->type) { 2988fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2989fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2990fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2991fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2992fcdce8c4SStefano Zampini break; 2993fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2994fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2995fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2996fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2997fcdce8c4SStefano Zampini break; 2998fcdce8c4SStefano Zampini default: 2999fcdce8c4SStefano Zampini break; 3000fcdce8c4SStefano Zampini } 3001fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 3002fcdce8c4SStefano Zampini ierr = 
MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 3003fcdce8c4SStefano Zampini } 3004ccdfe979SStefano Zampini PetscFunctionReturn(0); 3005ccdfe979SStefano Zampini } 3006ccdfe979SStefano Zampini 30076fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 30089ae82921SPaul Mullowney { 3009b175d8bbSPaul Mullowney PetscErrorCode ierr; 30109ae82921SPaul Mullowney 30119ae82921SPaul Mullowney PetscFunctionBegin; 3012e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3013e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3014e6e9a74fSStefano Zampini } 3015e6e9a74fSStefano Zampini 3016e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3017e6e9a74fSStefano Zampini { 3018e6e9a74fSStefano Zampini PetscErrorCode ierr; 3019e6e9a74fSStefano Zampini 3020e6e9a74fSStefano Zampini PetscFunctionBegin; 3021e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3022e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3023e6e9a74fSStefano Zampini } 3024e6e9a74fSStefano Zampini 3025e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3026e6e9a74fSStefano Zampini { 3027e6e9a74fSStefano Zampini PetscErrorCode ierr; 3028e6e9a74fSStefano Zampini 3029e6e9a74fSStefano Zampini PetscFunctionBegin; 3030e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3031e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3032e6e9a74fSStefano Zampini } 3033e6e9a74fSStefano Zampini 3034e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3035e6e9a74fSStefano Zampini { 3036e6e9a74fSStefano Zampini PetscErrorCode ierr; 3037e6e9a74fSStefano Zampini 3038e6e9a74fSStefano Zampini PetscFunctionBegin; 3039e6e9a74fSStefano Zampini ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 30409ae82921SPaul Mullowney PetscFunctionReturn(0); 30419ae82921SPaul Mullowney } 30429ae82921SPaul Mullowney 30436fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3044ca45077fSPaul Mullowney { 3045b175d8bbSPaul Mullowney PetscErrorCode ierr; 3046ca45077fSPaul Mullowney 3047ca45077fSPaul Mullowney PetscFunctionBegin; 3048e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3049ca45077fSPaul Mullowney PetscFunctionReturn(0); 3050ca45077fSPaul Mullowney } 3051ca45077fSPaul Mullowney 3052a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3053a0e72f99SJunchao Zhang { 3054a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 3055a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3056a0e72f99SJunchao Zhang } 3057a0e72f99SJunchao Zhang 3058afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3059e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 30609ae82921SPaul Mullowney { 30619ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3062aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 30639ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3064e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3065b175d8bbSPaul Mullowney PetscErrorCode ierr; 3066aa372e3fSPaul Mullowney cusparseStatus_t stat; 3067e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3068e6e9a74fSStefano Zampini PetscBool compressed; 3069afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3070afb2bd1cSJunchao Zhang PetscInt nx,ny; 3071afb2bd1cSJunchao Zhang #endif 30726e111a19SKarl Rupp 30739ae82921SPaul Mullowney PetscFunctionBegin; 3074e8d2b73aSMark Adams if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3075e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 3076afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 3077d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 3078e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3079e6e9a74fSStefano Zampini } 308034d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 308134d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3082e6e9a74fSStefano Zampini if (!trans) { 30839ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3084e8d2b73aSMark Adams if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3085e6e9a74fSStefano Zampini } else { 30861a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3087e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3088e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3089e6e9a74fSStefano Zampini } else { 30903606e59fSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3091e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3092e6e9a74fSStefano Zampini } 3093e6e9a74fSStefano Zampini } 3094e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3095e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3096213423ffSJunchao Zhang 3097e6e9a74fSStefano Zampini try { 3098e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3099213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3100213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3101afb2bd1cSJunchao Zhang 310285ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3103e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3104afb2bd1cSJunchao Zhang /* z = A x + beta y. 3105afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
3106afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3107afb2bd1cSJunchao Zhang */ 3108e6e9a74fSStefano Zampini xptr = xarray; 3109afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3110213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3111afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3112afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3113afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3114afb2bd1cSJunchao Zhang */ 3115afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3116afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3117afb2bd1cSJunchao Zhang nx = mat->num_cols; 3118afb2bd1cSJunchao Zhang ny = mat->num_rows; 3119afb2bd1cSJunchao Zhang } 3120afb2bd1cSJunchao Zhang #endif 3121e6e9a74fSStefano Zampini } else { 3122afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3123afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3124afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3125afb2bd1cSJunchao Zhang */ 3126afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3127e6e9a74fSStefano Zampini dptr = zarray; 3128e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3129afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3130e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3131a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3132e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3133e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3134e6e9a74fSStefano Zampini } 3135afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3136afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3137afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3138afb2bd1cSJunchao Zhang nx = mat->num_rows; 3139afb2bd1cSJunchao Zhang ny = mat->num_cols; 3140afb2bd1cSJunchao Zhang } 3141afb2bd1cSJunchao Zhang #endif 3142e6e9a74fSStefano Zampini } 31439ae82921SPaul Mullowney 3144afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3145aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3146afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3147afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3148afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3149ee7b52eaSHong Zhang cudaError_t cerr; 3150afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3151afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3152afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3153afb2bd1cSJunchao Zhang matstruct->matDescr, 3154afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3155afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3156afb2bd1cSJunchao Zhang cusparse_scalartype, 3157afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3158afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3159afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3160afb2bd1cSJunchao Zhang 3161afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3162afb2bd1cSJunchao Zhang } else { 3163afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3164afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3165afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3166afb2bd1cSJunchao Zhang } 3167afb2bd1cSJunchao Zhang 3168afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 3169afb2bd1cSJunchao Zhang matstruct->alpha_one, 31703606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3171afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3172afb2bd1cSJunchao Zhang beta, 3173afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3174afb2bd1cSJunchao Zhang cusparse_scalartype, 3175afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3176afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3177afb2bd1cSJunchao Zhang #else 31787656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 
3179e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3180a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3181afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3182aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3183e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 318457d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3185afb2bd1cSJunchao Zhang #endif 3186aa372e3fSPaul Mullowney } else { 3187213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3188afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3189afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3190afb2bd1cSJunchao Zhang #else 3191301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3192e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3193afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3194e6e9a74fSStefano Zampini xptr, beta, 319557d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3196afb2bd1cSJunchao Zhang #endif 3197a65300a6SPaul Mullowney } 3198aa372e3fSPaul Mullowney } 3199958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3200aa372e3fSPaul Mullowney 3201e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3202213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3203213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3204213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3205e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3206213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 32077656d835SStefano Zampini } 3208213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3209c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 32107656d835SStefano Zampini } 32117656d835SStefano Zampini 3212213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3213213423ffSJunchao Zhang if (compressed) { 3214e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3215a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3216a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3217a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
3218a0e72f99SJunchao Zhang */ 3219a0e72f99SJunchao Zhang #if 0 3220a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3221a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3222a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3223e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3224c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3225a0e72f99SJunchao Zhang #else 3226a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3227a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3228a0e72f99SJunchao Zhang #endif 3229958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3230e6e9a74fSStefano Zampini } 3231e6e9a74fSStefano Zampini } else { 3232e6e9a74fSStefano Zampini if (yy && yy != zz) { 3233e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3234e6e9a74fSStefano Zampini } 3235e6e9a74fSStefano Zampini } 3236e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3237213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3238213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 32399ae82921SPaul Mullowney } catch(char *ex) { 32409ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 32419ae82921SPaul Mullowney } 3242e6e9a74fSStefano Zampini if (yy) { 3243958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3244e6e9a74fSStefano 
Zampini } else { 3245e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3246e6e9a74fSStefano Zampini } 32479ae82921SPaul Mullowney PetscFunctionReturn(0); 32489ae82921SPaul Mullowney } 32499ae82921SPaul Mullowney 32506fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3251ca45077fSPaul Mullowney { 3252b175d8bbSPaul Mullowney PetscErrorCode ierr; 32536e111a19SKarl Rupp 3254ca45077fSPaul Mullowney PetscFunctionBegin; 3255e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3256ca45077fSPaul Mullowney PetscFunctionReturn(0); 3257ca45077fSPaul Mullowney } 3258ca45077fSPaul Mullowney 32596fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 32609ae82921SPaul Mullowney { 32619ae82921SPaul Mullowney PetscErrorCode ierr; 3262042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3263042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 32643fa6b06aSMark Adams 3265042217e8SBarry Smith PetscFunctionBegin; 3266042217e8SBarry Smith ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3267042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3268042217e8SBarry Smith cudaError_t cerr; 3269042217e8SBarry Smith 3270042217e8SBarry Smith ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3271042217e8SBarry Smith cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3272042217e8SBarry Smith cusp->deviceMat = NULL; 3273042217e8SBarry Smith } 32749ae82921SPaul Mullowney PetscFunctionReturn(0); 32759ae82921SPaul Mullowney } 32769ae82921SPaul Mullowney 32779ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3278e057df02SPaul Mullowney /*@ 32799ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 
3280e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately be pushed down 3281e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3282e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3283e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3284e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 32859ae82921SPaul Mullowney 3286d083f849SBarry Smith Collective 32879ae82921SPaul Mullowney 32889ae82921SPaul Mullowney Input Parameters: 32899ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 32909ae82921SPaul Mullowney . m - number of rows 32919ae82921SPaul Mullowney . n - number of columns 32929ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 32939ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 32940298fd71SBarry Smith (possibly different for each row) or NULL 32959ae82921SPaul Mullowney 32969ae82921SPaul Mullowney Output Parameter: 32979ae82921SPaul Mullowney . A - the matrix 32989ae82921SPaul Mullowney 32999ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 33009ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 33019ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 33029ae82921SPaul Mullowney 33039ae82921SPaul Mullowney Notes: 33049ae82921SPaul Mullowney If nnz is given then nz is ignored 33059ae82921SPaul Mullowney 33069ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 33079ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 33089ae82921SPaul Mullowney storage.
That is, the stored row and column indices can begin at 33099ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 33109ae82921SPaul Mullowney 33119ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 33120298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 33139ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 33149ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 33159ae82921SPaul Mullowney 33169ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 33179ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 33189ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 33199ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 33209ae82921SPaul Mullowney 33219ae82921SPaul Mullowney Level: intermediate 33229ae82921SPaul Mullowney 3323e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 33249ae82921SPaul Mullowney @*/ 33259ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 33269ae82921SPaul Mullowney { 33279ae82921SPaul Mullowney PetscErrorCode ierr; 33289ae82921SPaul Mullowney 33299ae82921SPaul Mullowney PetscFunctionBegin; 33309ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 33319ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 33329ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 33339ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 33349ae82921SPaul Mullowney PetscFunctionReturn(0); 
33359ae82921SPaul Mullowney } 33369ae82921SPaul Mullowney 33376fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 33389ae82921SPaul Mullowney { 33399ae82921SPaul Mullowney PetscErrorCode ierr; 3340ab25e6cbSDominic Meiser 33419ae82921SPaul Mullowney PetscFunctionBegin; 33429ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 3343470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 33449ae82921SPaul Mullowney } else { 3345470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3346aa372e3fSPaul Mullowney } 3347c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3348ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3349365b711fSMark Adams ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr); 3350ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3351ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3352fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3353ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 33547e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 33557e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3356ae48a8d0SStefano Zampini ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 33579ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 33589ae82921SPaul Mullowney PetscFunctionReturn(0); 33599ae82921SPaul Mullowney } 33609ae82921SPaul Mullowney 3361ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 336295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 33639ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 33649ff858a8SKarl Rupp { 33659ff858a8SKarl Rupp PetscErrorCode ierr; 33669ff858a8SKarl Rupp 33679ff858a8SKarl Rupp PetscFunctionBegin; 33689ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3369ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 33709ff858a8SKarl Rupp PetscFunctionReturn(0); 33719ff858a8SKarl Rupp } 33729ff858a8SKarl Rupp 3373039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 337495639643SRichard Tran Mills { 3375e6e9a74fSStefano Zampini PetscErrorCode ierr; 3376a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3377039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3378039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3379039c6fbaSStefano Zampini PetscScalar *ay; 3380039c6fbaSStefano Zampini const PetscScalar *ax; 3381039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3382e6e9a74fSStefano Zampini 338395639643SRichard Tran Mills PetscFunctionBegin; 3384a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3385a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3386039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3387a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3388a587d139SMark ierr = 
MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3389a587d139SMark PetscFunctionReturn(0); 339095639643SRichard Tran Mills } 3391039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 3392a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3393a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3394e8d2b73aSMark Adams if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3395e8d2b73aSMark Adams if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3396039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3397039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3398039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3399039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3400039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3401039c6fbaSStefano Zampini if (eq) { 3402039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3403039c6fbaSStefano Zampini } 3404039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3405039c6fbaSStefano Zampini } 3406d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3407d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3408039c6fbaSStefano Zampini 3409039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3410039c6fbaSStefano Zampini cusparseStatus_t stat; 3411039c6fbaSStefano Zampini PetscScalar b = 1.0; 3412039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3413039c6fbaSStefano Zampini size_t bufferSize; 3414039c6fbaSStefano Zampini void *buffer; 
3415ee7b52eaSHong Zhang cudaError_t cerr; 3416039c6fbaSStefano Zampini #endif 3417039c6fbaSStefano Zampini 3418039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3419039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3420039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3421039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3422039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3423039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3424039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3425039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3426039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3427039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3428039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3429039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3430039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3431039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3432039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3433039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3434039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3435039c6fbaSStefano Zampini #else 3436039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3437039c6fbaSStefano Zampini stat = 
cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3438039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3439039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3440039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3441039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3442039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3443039c6fbaSStefano Zampini #endif 3444039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3445039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3446039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3447039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3448039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3449a587d139SMark cublasHandle_t cublasv2handle; 3450039c6fbaSStefano Zampini cublasStatus_t berr; 3451a587d139SMark PetscBLASInt one = 1, bnz = 1; 3452039c6fbaSStefano Zampini 3453039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3454039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3455a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3456a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3457a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3458039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3459a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3460a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3461039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3462039c6fbaSStefano Zampini 
ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3463a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3464039c6fbaSStefano Zampini } else { 3465a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3466d2be01edSStefano Zampini ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3467a587d139SMark } 346895639643SRichard Tran Mills PetscFunctionReturn(0); 346995639643SRichard Tran Mills } 347095639643SRichard Tran Mills 347133c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 347233c9ba73SStefano Zampini { 347333c9ba73SStefano Zampini PetscErrorCode ierr; 347433c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 347533c9ba73SStefano Zampini PetscScalar *ay; 347633c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 347733c9ba73SStefano Zampini cublasStatus_t berr; 347833c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 347933c9ba73SStefano Zampini 348033c9ba73SStefano Zampini PetscFunctionBegin; 348133c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 348233c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 348333c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 348433c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 348533c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 348633c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 348733c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348833c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 348933c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 349033c9ba73SStefano Zampini PetscFunctionReturn(0); 349133c9ba73SStefano Zampini } 349233c9ba73SStefano Zampini 34933fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 34943fa6b06aSMark Adams { 
34953fa6b06aSMark Adams PetscErrorCode ierr; 34967e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3497a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 34987e8381f9SStefano Zampini 34993fa6b06aSMark Adams PetscFunctionBegin; 35003fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 35013fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 35027e8381f9SStefano Zampini if (spptr->mat) { 35037e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 35047e8381f9SStefano Zampini if (matrix->values) { 35057e8381f9SStefano Zampini both = PETSC_TRUE; 35067e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 35077e8381f9SStefano Zampini } 35087e8381f9SStefano Zampini } 35097e8381f9SStefano Zampini if (spptr->matTranspose) { 35107e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 35117e8381f9SStefano Zampini if (matrix->values) { 35127e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 35137e8381f9SStefano Zampini } 35147e8381f9SStefano Zampini } 35153fa6b06aSMark Adams } 3516a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3517a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3518a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 35197e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3520a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 35213fa6b06aSMark Adams PetscFunctionReturn(0); 35223fa6b06aSMark Adams } 35233fa6b06aSMark Adams 3524a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3525a587d139SMark { 3526a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3527a587d139SMark PetscErrorCode ierr; 3528a587d139SMark 3529a587d139SMark PetscFunctionBegin; 3530*9a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 3531*9a14fc28SStefano Zampini A->boundtocpu = flg; 
3532*9a14fc28SStefano Zampini PetscFunctionReturn(0); 3533*9a14fc28SStefano Zampini } 3534a587d139SMark if (flg) { 3535a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3536a587d139SMark 353733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3538a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3539a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3540a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3541a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3542a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3543a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3544a587d139SMark A->ops->multhermitiantranspose = NULL; 3545a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3546fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 354767a45760SJunchao Zhang ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3548c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3549a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3550a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3551a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3552a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3553a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3554fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3555a587d139SMark } else { 355633c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3557a587d139SMark A->ops->axpy = 
MatAXPY_SeqAIJCUSPARSE; 3558a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3559a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3560a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3561a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3562a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3563a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3564a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3565fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 356667a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 356767a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 356867a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 356967a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 357067a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 357167a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3572c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3573a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3574a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 
3577fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578a587d139SMark } 3579a587d139SMark A->boundtocpu = flg; 3580ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3581ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3582ea500dcfSRichard Tran Mills } else { 3583ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3584ea500dcfSRichard Tran Mills } 3585a587d139SMark PetscFunctionReturn(0); 3586a587d139SMark } 3587a587d139SMark 358849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 35899ae82921SPaul Mullowney { 35909ae82921SPaul Mullowney PetscErrorCode ierr; 3591aa372e3fSPaul Mullowney cusparseStatus_t stat; 359249735bf3SStefano Zampini Mat B; 35939ae82921SPaul Mullowney 35949ae82921SPaul Mullowney PetscFunctionBegin; 3595a4af0ceeSJacob Faibussowitsch ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 359649735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 359749735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 359849735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 359949735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 360049735bf3SStefano Zampini } 360149735bf3SStefano Zampini B = *newmat; 360249735bf3SStefano Zampini 360334136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 360434136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 360534136279SStefano Zampini 360649735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 36079ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3608e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3609e6e9a74fSStefano Zampini ierr = 
PetscNew(&spptr);CHKERRQ(ierr); 3610e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3611a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 36121a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3613d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3614a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3615a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3616a435da06SStefano Zampini #else 3617d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3618a435da06SStefano Zampini #endif 3619d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3620d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3621d8132acaSStefano Zampini #endif 36221a2c6b5cSJunchao Zhang B->spptr = spptr; 36239ae82921SPaul Mullowney } else { 3624e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3625e6e9a74fSStefano Zampini 3626e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3627e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3628a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3629e6e9a74fSStefano Zampini B->spptr = spptr; 36309ae82921SPaul Mullowney } 3631e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 363249735bf3SStefano Zampini } 3633693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 36349ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 36351a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 36369ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 363795639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3638693b0035SStefano 
Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 36392205254eSKarl Rupp 3640e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 36419ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3642bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3643ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 3644ae48a8d0SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3645ae48a8d0SStefano Zampini #endif 3646365b711fSMark Adams ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 36479ae82921SPaul Mullowney PetscFunctionReturn(0); 36489ae82921SPaul Mullowney } 36499ae82921SPaul Mullowney 365002fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 365102fe1965SBarry Smith { 365202fe1965SBarry Smith PetscErrorCode ierr; 365302fe1965SBarry Smith 365402fe1965SBarry Smith PetscFunctionBegin; 365502fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 36560ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 365702fe1965SBarry Smith PetscFunctionReturn(0); 365802fe1965SBarry Smith } 365902fe1965SBarry Smith 36603ca39a21SBarry Smith /*MC 3661e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3662e057df02SPaul Mullowney 3663e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 36642692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 36652692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 
3666e057df02SPaul Mullowney 3667e057df02SPaul Mullowney Options Database Keys: 3668e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3669aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3670a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3671365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3672e057df02SPaul Mullowney 3673e057df02SPaul Mullowney Level: beginner 3674e057df02SPaul Mullowney 36758468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3676e057df02SPaul Mullowney M*/ 36777f756511SDominic Meiser 3678bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 36790f39cd5aSBarry Smith 36803ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 368142c9c57cSBarry Smith { 368242c9c57cSBarry Smith PetscErrorCode ierr; 368342c9c57cSBarry Smith 368442c9c57cSBarry Smith PetscFunctionBegin; 3685bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 36863ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 36873ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 36883ca39a21SBarry Smith ierr = 
MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 36893ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3690bddcd29dSMark Adams 369142c9c57cSBarry Smith PetscFunctionReturn(0); 369242c9c57cSBarry Smith } 369329b38603SBarry Smith 3694470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 36957f756511SDominic Meiser { 3696e6e9a74fSStefano Zampini PetscErrorCode ierr; 36977f756511SDominic Meiser cusparseStatus_t stat; 36987f756511SDominic Meiser 36997f756511SDominic Meiser PetscFunctionBegin; 37007f756511SDominic Meiser if (*cusparsestruct) { 3701e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3702e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 37037f756511SDominic Meiser delete (*cusparsestruct)->workVector; 370481902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 37057e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 37067e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3707a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 37087e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3709e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 37107f756511SDominic Meiser } 37117f756511SDominic Meiser PetscFunctionReturn(0); 37127f756511SDominic Meiser } 37137f756511SDominic Meiser 37147f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 37157f756511SDominic Meiser { 37167f756511SDominic Meiser PetscFunctionBegin; 37177f756511SDominic Meiser if (*mat) { 37187f756511SDominic Meiser delete 
(*mat)->values; 37197f756511SDominic Meiser delete (*mat)->column_indices; 37207f756511SDominic Meiser delete (*mat)->row_offsets; 37217f756511SDominic Meiser delete *mat; 37227f756511SDominic Meiser *mat = 0; 37237f756511SDominic Meiser } 37247f756511SDominic Meiser PetscFunctionReturn(0); 37257f756511SDominic Meiser } 37267f756511SDominic Meiser 3727470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 37287f756511SDominic Meiser { 37297f756511SDominic Meiser cusparseStatus_t stat; 37307f756511SDominic Meiser PetscErrorCode ierr; 37317f756511SDominic Meiser 37327f756511SDominic Meiser PetscFunctionBegin; 37337f756511SDominic Meiser if (*trifactor) { 373457d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3735afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 37367f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 37371b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 37382cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3739afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 37401b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3741afb2bd1cSJunchao Zhang #endif 3742da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 37437f756511SDominic Meiser } 37447f756511SDominic Meiser PetscFunctionReturn(0); 37457f756511SDominic Meiser } 37467f756511SDominic Meiser 3747470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 37487f756511SDominic Meiser { 
37497f756511SDominic Meiser CsrMatrix *mat; 37507f756511SDominic Meiser cusparseStatus_t stat; 37517f756511SDominic Meiser cudaError_t err; 37527f756511SDominic Meiser 37537f756511SDominic Meiser PetscFunctionBegin; 37547f756511SDominic Meiser if (*matstruct) { 37557f756511SDominic Meiser if ((*matstruct)->mat) { 37567f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3757afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3758afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3759afb2bd1cSJunchao Zhang #else 37607f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 376157d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3762afb2bd1cSJunchao Zhang #endif 37637f756511SDominic Meiser } else { 37647f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 37657f756511SDominic Meiser CsrMatrix_Destroy(&mat); 37667f756511SDominic Meiser } 37677f756511SDominic Meiser } 376857d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 37697f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3770afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 37717656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 37727656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3773afb2bd1cSJunchao Zhang 3774afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3775afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3776afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3777afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3778afb2bd1cSJunchao Zhang if 
(mdata->cuSpMV[i].initialized) { 3779afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3780afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3781afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3782afb2bd1cSJunchao Zhang } 3783afb2bd1cSJunchao Zhang } 3784afb2bd1cSJunchao Zhang #endif 37857f756511SDominic Meiser delete *matstruct; 37867e8381f9SStefano Zampini *matstruct = NULL; 37877f756511SDominic Meiser } 37887f756511SDominic Meiser PetscFunctionReturn(0); 37897f756511SDominic Meiser } 37907f756511SDominic Meiser 3791e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 37927f756511SDominic Meiser { 3793e6e9a74fSStefano Zampini PetscErrorCode ierr; 3794e6e9a74fSStefano Zampini 37957f756511SDominic Meiser PetscFunctionBegin; 37967f756511SDominic Meiser if (*trifactors) { 3797e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3798e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3799e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3800e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 38017f756511SDominic Meiser delete (*trifactors)->rpermIndices; 38027f756511SDominic Meiser delete (*trifactors)->cpermIndices; 38037f756511SDominic Meiser delete (*trifactors)->workVector; 38047e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 38057e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 38067e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 3807bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3808bddcd29dSMark 
Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3809e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3810ccdfe979SStefano Zampini } 3811ccdfe979SStefano Zampini PetscFunctionReturn(0); 3812ccdfe979SStefano Zampini } 3813ccdfe979SStefano Zampini 3814ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3815ccdfe979SStefano Zampini { 3816e6e9a74fSStefano Zampini PetscErrorCode ierr; 3817ccdfe979SStefano Zampini cusparseHandle_t handle; 3818ccdfe979SStefano Zampini cusparseStatus_t stat; 3819ccdfe979SStefano Zampini 3820ccdfe979SStefano Zampini PetscFunctionBegin; 3821ccdfe979SStefano Zampini if (*trifactors) { 3822e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 38237f756511SDominic Meiser if (handle = (*trifactors)->handle) { 382457d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 38257f756511SDominic Meiser } 3826e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 38277f756511SDominic Meiser } 38287f756511SDominic Meiser PetscFunctionReturn(0); 38297f756511SDominic Meiser } 38307e8381f9SStefano Zampini 38317e8381f9SStefano Zampini struct IJCompare 38327e8381f9SStefano Zampini { 38337e8381f9SStefano Zampini __host__ __device__ 38347e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 38357e8381f9SStefano Zampini { 38367e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 38377e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 38387e8381f9SStefano Zampini return false; 38397e8381f9SStefano Zampini } 38407e8381f9SStefano Zampini }; 38417e8381f9SStefano Zampini 38427e8381f9SStefano Zampini struct IJEqual 38437e8381f9SStefano Zampini { 38447e8381f9SStefano Zampini __host__ __device__ 38457e8381f9SStefano 
Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 38467e8381f9SStefano Zampini { 38477e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 38487e8381f9SStefano Zampini return true; 38497e8381f9SStefano Zampini } 38507e8381f9SStefano Zampini }; 38517e8381f9SStefano Zampini 38527e8381f9SStefano Zampini struct IJDiff 38537e8381f9SStefano Zampini { 38547e8381f9SStefano Zampini __host__ __device__ 38557e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 38567e8381f9SStefano Zampini { 38577e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 38587e8381f9SStefano Zampini } 38597e8381f9SStefano Zampini }; 38607e8381f9SStefano Zampini 38617e8381f9SStefano Zampini struct IJSum 38627e8381f9SStefano Zampini { 38637e8381f9SStefano Zampini __host__ __device__ 38647e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 38657e8381f9SStefano Zampini { 38667e8381f9SStefano Zampini return t1||t2; 38677e8381f9SStefano Zampini } 38687e8381f9SStefano Zampini }; 38697e8381f9SStefano Zampini 38707e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3871e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 38727e8381f9SStefano Zampini { 38737e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3874fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3875bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 387608391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 38777e8381f9SStefano Zampini CsrMatrix *matrix; 38787e8381f9SStefano Zampini PetscErrorCode ierr; 38797e8381f9SStefano Zampini PetscInt n; 38807e8381f9SStefano Zampini 38817e8381f9SStefano Zampini PetscFunctionBegin; 38827e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
CUSPARSE struct"); 38837e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 38847e8381f9SStefano Zampini if (!cusp->cooPerm) { 38857e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 38867e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 38877e8381f9SStefano Zampini PetscFunctionReturn(0); 38887e8381f9SStefano Zampini } 38897e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 38907e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3891e61fc153SStefano Zampini if (!v) { 3892e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3893e61fc153SStefano Zampini goto finalize; 38947e8381f9SStefano Zampini } 3895e61fc153SStefano Zampini n = cusp->cooPerm->size(); 389608391a17SStefano Zampini if (isCudaMem(v)) { 389708391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 389808391a17SStefano Zampini } else { 3899e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3900e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 390108391a17SStefano Zampini d_v = cooPerm_v->data(); 3902e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 390308391a17SStefano Zampini } 3904bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3905e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3906ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3907bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 390808391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3909ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 
3910ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3911ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3912ddea5d60SJunchao Zhang */ 3913e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3914e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3915e61fc153SStefano Zampini delete cooPerm_w; 39167e8381f9SStefano Zampini } else { 3917ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 391808391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 39197e8381f9SStefano Zampini matrix->values->begin())); 392008391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 39217e8381f9SStefano Zampini matrix->values->end())); 3922ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 39237e8381f9SStefano Zampini } 39247e8381f9SStefano Zampini } else { 3925e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 392608391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3927e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 39287e8381f9SStefano Zampini } else { 392908391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 39307e8381f9SStefano 
Zampini matrix->values->begin())); 393108391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 39327e8381f9SStefano Zampini matrix->values->end())); 39337e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 39347e8381f9SStefano Zampini } 39357e8381f9SStefano Zampini } 3936bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3937e61fc153SStefano Zampini finalize: 3938e61fc153SStefano Zampini delete cooPerm_v; 39397e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3940e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3941fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3942fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3943fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3944fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3945fcdce8c4SStefano Zampini a->reallocs = 0; 3946fcdce8c4SStefano Zampini A->info.mallocs += 0; 3947fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3948fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3949fcdce8c4SStefano Zampini A->num_ass++; 39507e8381f9SStefano Zampini PetscFunctionReturn(0); 39517e8381f9SStefano Zampini } 39527e8381f9SStefano Zampini 3953a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3954a49f1ed0SStefano Zampini { 3955a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3956a49f1ed0SStefano Zampini PetscErrorCode ierr; 3957a49f1ed0SStefano Zampini 3958a49f1ed0SStefano Zampini PetscFunctionBegin; 3959a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3960a49f1ed0SStefano Zampini if (!cusp) 
PetscFunctionReturn(0); 3961a49f1ed0SStefano Zampini if (destroy) { 3962a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3963a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3964a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3965a49f1ed0SStefano Zampini } 39661a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3967a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3968a49f1ed0SStefano Zampini } 3969a49f1ed0SStefano Zampini 39707e8381f9SStefano Zampini #include <thrust/binary_search.h> 3971e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 39727e8381f9SStefano Zampini { 39737e8381f9SStefano Zampini PetscErrorCode ierr; 39747e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 39757e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 39767e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 39777e8381f9SStefano Zampini cudaError_t cerr; 39787e8381f9SStefano Zampini 39797e8381f9SStefano Zampini PetscFunctionBegin; 39807e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 39817e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 39827e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 39837e8381f9SStefano Zampini if (n != cooPerm_n) { 39847e8381f9SStefano Zampini delete cusp->cooPerm; 39857e8381f9SStefano Zampini delete cusp->cooPerm_a; 39867e8381f9SStefano Zampini cusp->cooPerm = NULL; 39877e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 39887e8381f9SStefano Zampini } 39897e8381f9SStefano Zampini if (n) { 39907e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 39917e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 39927e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 39937e8381f9SStefano Zampini 39947e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 39957e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 39967e8381f9SStefano Zampini 39977e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 39987e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 39997e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 4000ddea5d60SJunchao Zhang 4001ddea5d60SJunchao Zhang /* Ex. 
4002ddea5d60SJunchao Zhang n = 6 4003ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 4004ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 4005ddea5d60SJunchao Zhang */ 40067e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 40077e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 40087e8381f9SStefano Zampini 400908391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 40107e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4011ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4012ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 40137e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 40147e8381f9SStefano Zampini 4015ddea5d60SJunchao Zhang /* 4016ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 4017ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 4018ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 4019ddea5d60SJunchao Zhang */ 4020ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4021ddea5d60SJunchao Zhang 4022ddea5d60SJunchao Zhang /* 4023ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 4024ddea5d60SJunchao Zhang ^ekey 4025ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 4026ddea5d60SJunchao Zhang ^nekye 4027ddea5d60SJunchao Zhang */ 40287e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 40297e8381f9SStefano Zampini delete cusp->cooPerm_a; 40307e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 4031ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4032ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4033ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 4034ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4035ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 40367e8381f9SStefano Zampini w[0] = 0; 4037ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4038ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 40397e8381f9SStefano Zampini } 40407e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 4041ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4042ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4043ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 404408391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 40457e8381f9SStefano Zampini 40467e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 40477e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 40487e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 40497e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 40507e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4051ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 40527e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 40537e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 4054fcdce8c4SStefano Zampini a->rmax = 0; 40557e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 40567e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 40577e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 40587e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 40597e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 40607e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 40617e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 40627e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 40637e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 4064fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 40657e8381f9SStefano Zampini } 4066fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 40677e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 40687e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4069fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 40707e8381f9SStefano Zampini } else { 40717e8381f9SStefano 
Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 40727e8381f9SStefano Zampini } 4073e61fc153SStefano Zampini ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 40747e8381f9SStefano Zampini 40757e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4076e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 4077e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 40787e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 40797e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 40807e8381f9SStefano Zampini A->nonzerostate++; 40817e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4082a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 40837e8381f9SStefano Zampini 40847e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 40857e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 40867e8381f9SStefano Zampini PetscFunctionReturn(0); 40877e8381f9SStefano Zampini } 4088ed502f03SStefano Zampini 40895b7e41feSStefano Zampini /*@C 40905b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
40915b7e41feSStefano Zampini 40925b7e41feSStefano Zampini Not collective 40935b7e41feSStefano Zampini 40945b7e41feSStefano Zampini Input Parameters: 40955b7e41feSStefano Zampini + A - the matrix 40965b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40975b7e41feSStefano Zampini 40985b7e41feSStefano Zampini Output Parameters: 40995b7e41feSStefano Zampini + ia - the CSR row pointers 41005b7e41feSStefano Zampini - ja - the CSR column indices 41015b7e41feSStefano Zampini 41025b7e41feSStefano Zampini Level: developer 41035b7e41feSStefano Zampini 41045b7e41feSStefano Zampini Notes: 41055b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 41065b7e41feSStefano Zampini 41075b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 41085b7e41feSStefano Zampini @*/ 41095f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 41105f101d05SStefano Zampini { 41115f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 41125f101d05SStefano Zampini CsrMatrix *csr; 41135f101d05SStefano Zampini PetscErrorCode ierr; 41145f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41155f101d05SStefano Zampini 41165f101d05SStefano Zampini PetscFunctionBegin; 41175f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41185f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 41195f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41205f101d05SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41215f101d05SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 41225f101d05SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 41235f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 41245f101d05SStefano Zampini if (i) { 41255f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 41265f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 41275f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 41285f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 41295f101d05SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 41305f101d05SStefano Zampini } 41315f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 41325f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 41335f101d05SStefano Zampini } 41345f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 41355f101d05SStefano Zampini PetscFunctionReturn(0); 41365f101d05SStefano Zampini } 41375f101d05SStefano Zampini 41385b7e41feSStefano Zampini /*@C 41395b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 41405b7e41feSStefano Zampini 41415b7e41feSStefano Zampini Not collective 41425b7e41feSStefano Zampini 41435b7e41feSStefano Zampini Input Parameters: 41445b7e41feSStefano Zampini + A - the matrix 41455b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 41465b7e41feSStefano Zampini 41475b7e41feSStefano Zampini Output Parameters: 41485b7e41feSStefano Zampini + ia - the CSR row pointers 41495b7e41feSStefano Zampini - ja - the CSR column indices 41505b7e41feSStefano Zampini 41515b7e41feSStefano Zampini Level: developer 41525b7e41feSStefano Zampini 41535b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetIJ() 41545b7e41feSStefano Zampini @*/ 41555f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, 
const int **j) 41565f101d05SStefano Zampini { 41575f101d05SStefano Zampini PetscFunctionBegin; 41585f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41595f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41605f101d05SStefano Zampini if (i) *i = NULL; 41615f101d05SStefano Zampini if (j) *j = NULL; 41625f101d05SStefano Zampini PetscFunctionReturn(0); 41635f101d05SStefano Zampini } 41645f101d05SStefano Zampini 41655b7e41feSStefano Zampini /*@C 41665b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41675b7e41feSStefano Zampini 41685b7e41feSStefano Zampini Not Collective 41695b7e41feSStefano Zampini 41705b7e41feSStefano Zampini Input Parameter: 41715b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41725b7e41feSStefano Zampini 41735b7e41feSStefano Zampini Output Parameter: 41745b7e41feSStefano Zampini . a - pointer to the device data 41755b7e41feSStefano Zampini 41765b7e41feSStefano Zampini Level: developer 41775b7e41feSStefano Zampini 41785b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 41795b7e41feSStefano Zampini 41805b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 41815b7e41feSStefano Zampini @*/ 4182ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4183ed502f03SStefano Zampini { 4184ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4185ed502f03SStefano Zampini CsrMatrix *csr; 4186ed502f03SStefano Zampini PetscErrorCode ierr; 4187ed502f03SStefano Zampini 4188ed502f03SStefano Zampini PetscFunctionBegin; 4189ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4190ed502f03SStefano Zampini PetscValidPointer(a,2); 4191ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 
4192ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4193ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 419433c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4195ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4196ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4197ed502f03SStefano Zampini *a = csr->values->data().get(); 4198ed502f03SStefano Zampini PetscFunctionReturn(0); 4199ed502f03SStefano Zampini } 4200ed502f03SStefano Zampini 42015b7e41feSStefano Zampini /*@C 42025b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 42035b7e41feSStefano Zampini 42045b7e41feSStefano Zampini Not Collective 42055b7e41feSStefano Zampini 42065b7e41feSStefano Zampini Input Parameter: 42075b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42085b7e41feSStefano Zampini 42095b7e41feSStefano Zampini Output Parameter: 42105b7e41feSStefano Zampini . 
a - pointer to the device data 42115b7e41feSStefano Zampini 42125b7e41feSStefano Zampini Level: developer 42135b7e41feSStefano Zampini 42145b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead() 42155b7e41feSStefano Zampini @*/ 4216ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4217ed502f03SStefano Zampini { 4218ed502f03SStefano Zampini PetscFunctionBegin; 4219ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4220ed502f03SStefano Zampini PetscValidPointer(a,2); 4221ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4222ed502f03SStefano Zampini *a = NULL; 4223ed502f03SStefano Zampini PetscFunctionReturn(0); 4224ed502f03SStefano Zampini } 4225ed502f03SStefano Zampini 42265b7e41feSStefano Zampini /*@C 42275b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42285b7e41feSStefano Zampini 42295b7e41feSStefano Zampini Not Collective 42305b7e41feSStefano Zampini 42315b7e41feSStefano Zampini Input Parameter: 42325b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42335b7e41feSStefano Zampini 42345b7e41feSStefano Zampini Output Parameter: 42355b7e41feSStefano Zampini . 
a - pointer to the device data 42365b7e41feSStefano Zampini 42375b7e41feSStefano Zampini Level: developer 42385b7e41feSStefano Zampini 42395b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 42405b7e41feSStefano Zampini 42415b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 42425b7e41feSStefano Zampini @*/ 4243039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4244039c6fbaSStefano Zampini { 4245039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4246039c6fbaSStefano Zampini CsrMatrix *csr; 4247039c6fbaSStefano Zampini PetscErrorCode ierr; 4248039c6fbaSStefano Zampini 4249039c6fbaSStefano Zampini PetscFunctionBegin; 4250039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4251039c6fbaSStefano Zampini PetscValidPointer(a,2); 4252039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4253039c6fbaSStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4254039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 425533c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4256039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4257039c6fbaSStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4258039c6fbaSStefano Zampini *a = csr->values->data().get(); 4259039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4260a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4261039c6fbaSStefano Zampini PetscFunctionReturn(0); 4262039c6fbaSStefano Zampini } 42635b7e41feSStefano Zampini /*@C 42645b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access 
array obtained from MatSeqAIJCUSPARSEGetArray() 4265039c6fbaSStefano Zampini 42665b7e41feSStefano Zampini Not Collective 42675b7e41feSStefano Zampini 42685b7e41feSStefano Zampini Input Parameter: 42695b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42705b7e41feSStefano Zampini 42715b7e41feSStefano Zampini Output Parameter: 42725b7e41feSStefano Zampini . a - pointer to the device data 42735b7e41feSStefano Zampini 42745b7e41feSStefano Zampini Level: developer 42755b7e41feSStefano Zampini 42765b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray() 42775b7e41feSStefano Zampini @*/ 4278039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4279039c6fbaSStefano Zampini { 4280039c6fbaSStefano Zampini PetscErrorCode ierr; 4281039c6fbaSStefano Zampini 4282039c6fbaSStefano Zampini PetscFunctionBegin; 4283039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4284039c6fbaSStefano Zampini PetscValidPointer(a,2); 4285039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4286039c6fbaSStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4287039c6fbaSStefano Zampini *a = NULL; 4288039c6fbaSStefano Zampini PetscFunctionReturn(0); 4289039c6fbaSStefano Zampini } 4290039c6fbaSStefano Zampini 42915b7e41feSStefano Zampini /*@C 42925b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42935b7e41feSStefano Zampini 42945b7e41feSStefano Zampini Not Collective 42955b7e41feSStefano Zampini 42965b7e41feSStefano Zampini Input Parameter: 42975b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42985b7e41feSStefano Zampini 42995b7e41feSStefano Zampini Output Parameter: 43005b7e41feSStefano Zampini . 
a - pointer to the device data 43015b7e41feSStefano Zampini 43025b7e41feSStefano Zampini Level: developer 43035b7e41feSStefano Zampini 43045b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 43055b7e41feSStefano Zampini 43065b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 43075b7e41feSStefano Zampini @*/ 4308ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4309ed502f03SStefano Zampini { 4310ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4311ed502f03SStefano Zampini CsrMatrix *csr; 4312a49f1ed0SStefano Zampini PetscErrorCode ierr; 4313ed502f03SStefano Zampini 4314ed502f03SStefano Zampini PetscFunctionBegin; 4315ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4316ed502f03SStefano Zampini PetscValidPointer(a,2); 4317ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4318ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 431933c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4320ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4321ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4322ed502f03SStefano Zampini *a = csr->values->data().get(); 4323039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4324a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4325ed502f03SStefano Zampini PetscFunctionReturn(0); 4326ed502f03SStefano Zampini } 4327ed502f03SStefano Zampini 43285b7e41feSStefano Zampini /*@C 43295b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from 
MatSeqAIJCUSPARSEGetArrayWrite() 43305b7e41feSStefano Zampini 43315b7e41feSStefano Zampini Not Collective 43325b7e41feSStefano Zampini 43335b7e41feSStefano Zampini Input Parameter: 43345b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43355b7e41feSStefano Zampini 43365b7e41feSStefano Zampini Output Parameter: 43375b7e41feSStefano Zampini . a - pointer to the device data 43385b7e41feSStefano Zampini 43395b7e41feSStefano Zampini Level: developer 43405b7e41feSStefano Zampini 43415b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 43425b7e41feSStefano Zampini @*/ 4343ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4344ed502f03SStefano Zampini { 4345ed502f03SStefano Zampini PetscErrorCode ierr; 4346ed502f03SStefano Zampini 4347ed502f03SStefano Zampini PetscFunctionBegin; 4348ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4349ed502f03SStefano Zampini PetscValidPointer(a,2); 4350ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4351ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4352ed502f03SStefano Zampini *a = NULL; 4353ed502f03SStefano Zampini PetscFunctionReturn(0); 4354ed502f03SStefano Zampini } 4355ed502f03SStefano Zampini 4356ed502f03SStefano Zampini struct IJCompare4 4357ed502f03SStefano Zampini { 4358ed502f03SStefano Zampini __host__ __device__ 43592ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4360ed502f03SStefano Zampini { 4361ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4362ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4363ed502f03SStefano Zampini return false; 4364ed502f03SStefano Zampini } 4365ed502f03SStefano Zampini }; 4366ed502f03SStefano Zampini 43678909a122SStefano Zampini struct Shift 43688909a122SStefano Zampini { 
4369ed502f03SStefano Zampini int _shift; 4370ed502f03SStefano Zampini 4371ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4372ed502f03SStefano Zampini __host__ __device__ 4373ed502f03SStefano Zampini inline int operator() (const int &c) 4374ed502f03SStefano Zampini { 4375ed502f03SStefano Zampini return c + _shift; 4376ed502f03SStefano Zampini } 4377ed502f03SStefano Zampini }; 4378ed502f03SStefano Zampini 4379ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4380ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4381ed502f03SStefano Zampini { 4382ed502f03SStefano Zampini PetscErrorCode ierr; 4383ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4384ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4385ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4386ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4387ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4388ed502f03SStefano Zampini cusparseStatus_t stat; 4389ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4390ed502f03SStefano Zampini cudaError_t cerr; 4391ed502f03SStefano Zampini 4392ed502f03SStefano Zampini PetscFunctionBegin; 4393ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4394ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4395ed502f03SStefano Zampini PetscValidPointer(C,4); 4396ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4397ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4398ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4399ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4400ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4401ed502f03SStefano Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4402ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4403ed502f03SStefano Zampini m = A->rmap->n; 4404ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 4405ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4406ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4407ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4408ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4409ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4410ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4411ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4412ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4413ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4414ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4415ed502f03SStefano Zampini c->compressedrow.i = NULL; 4416ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4417ed502f03SStefano Zampini Ccusp->workVector = NULL; 4418ed502f03SStefano Zampini Ccusp->nrows = m; 4419ed502f03SStefano Zampini Ccusp->mat = Cmat; 4420ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4421ed502f03SStefano Zampini Ccsr->num_rows = m; 4422ed502f03SStefano Zampini Ccsr->num_cols = n; 4423ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4424ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4425ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 
4426ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4427ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4428ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4429ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4430ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4431ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4432ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4433ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4434ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4435ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4436ed502f03SStefano Zampini 4437ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4438ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4439ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4440ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4441ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4442ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4443ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4444ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4445ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4446ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4447ed502f03SStefano Zampini if (c->nz) { 44482ed87e7eSStefano Zampini auto Acoo = new 
THRUSTINTARRAY32(Annz); 44492ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 44502ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 44512ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 44522ed87e7eSStefano Zampini 4453ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4454ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4455ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4456ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4457ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4458ed502f03SStefano Zampini } 44592ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 44602ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4461ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4462ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4463ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4464ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4465ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4466ed502f03SStefano Zampini } 44672ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 44682ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 4469ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 44702ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 44712ed87e7eSStefano Zampini Aroff->data().get(), 44722ed87e7eSStefano Zampini Annz, 44732ed87e7eSStefano Zampini m, 44742ed87e7eSStefano Zampini Acoo->data().get(), 44752ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4476ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 44772ed87e7eSStefano Zampini Broff->data().get(), 4478ed502f03SStefano Zampini Bnnz, 4479ed502f03SStefano Zampini m, 
44802ed87e7eSStefano Zampini Bcoo->data().get(), 4481ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 44822ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 44832ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 44842ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 44858909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4486ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4487ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 44888909a122SStefano Zampini #else 44898909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 44908909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 44918909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 44928909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 44938909a122SStefano Zampini #endif 44942ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 44952ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 44962ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 44972ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 44982ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 44992ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4500ed502f03SStefano Zampini auto p1 = 
Ccusp->cooPerm->begin(); 4501ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4502ed502f03SStefano Zampini thrust::advance(p2,Annz); 45032ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 45048909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 45058909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 45068909a122SStefano Zampini #endif 45072ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 45082ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 45092ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 45102ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 45112ed87e7eSStefano Zampini #else 45122ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 45132ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 45142ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 45152ed87e7eSStefano Zampini #endif 4516ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 45172ed87e7eSStefano Zampini Ccoo->data().get(), 4518ed502f03SStefano Zampini c->nz, 4519ed502f03SStefano Zampini m, 4520ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4521ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4522ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 45232ed87e7eSStefano Zampini delete wPerm; 45242ed87e7eSStefano Zampini delete Acoo; 45252ed87e7eSStefano Zampini delete Bcoo; 45262ed87e7eSStefano Zampini delete Ccoo; 4527ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4528ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4529ed502f03SStefano Zampini 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4530ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4531ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4532ed502f03SStefano Zampini #endif 45331a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 45343606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 45353606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4536ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4537ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4538ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4539ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4540ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4541ed502f03SStefano Zampini 45421a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 45431a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4544a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4545ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4546ed502f03SStefano Zampini CmatT->mat = CcsrT; 4547ed502f03SStefano Zampini CcsrT->num_rows = n; 4548ed502f03SStefano Zampini CcsrT->num_cols = m; 4549ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4550ed502f03SStefano Zampini 4551ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4552ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4553ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4554ed502f03SStefano Zampini 4555ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4556ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4557ed502f03SStefano Zampini if (AT) { 4558ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4559ed502f03SStefano Zampini thrust::advance(rT,-1); 4560ed502f03SStefano Zampini } 4561ed502f03SStefano Zampini if (BT) { 4562ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4563ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4564ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4565ed502f03SStefano Zampini } 4566ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4567ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4568ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4569ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4570ed502f03SStefano Zampini if (AT) vT = 
thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4571ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4572ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4573ed502f03SStefano Zampini 4574ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4575ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4576ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4577ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4578ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4579ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4580ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4581ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4582ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4583ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4584ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4585ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4586ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4587ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4588ed502f03SStefano Zampini #endif 4589ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4590ed502f03SStefano Zampini 
} 4591ed502f03SStefano Zampini } 4592ed502f03SStefano Zampini 4593ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4594ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4595ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4596ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4597ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4598ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4599ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4600ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4601ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4602ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4603ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4604ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4605ed502f03SStefano Zampini } else { 4606ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4607ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4608ed502f03SStefano Zampini } 4609ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4610ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4611ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4612ed502f03SStefano Zampini c->maxnz = c->nz; 4613ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4614ed502f03SStefano Zampini c->rmax = 0; 4615ed502f03SStefano Zampini for (i = 0; i < m; i++) { 
4616ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4617ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4618ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4619ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4620ed502f03SStefano Zampini } 4621ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4622ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4623ed502f03SStefano Zampini (*C)->nonzerostate++; 4624ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4625ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4626ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4627ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4628ed502f03SStefano Zampini } else { 4629ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4630ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4631ed502f03SStefano Zampini if (c->nz) { 4632ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4633ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4634ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4635ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4636ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4637ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4638ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4639ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 
4640ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4641ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4642ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4643ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4644ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4645ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4646ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4647ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4648ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4649ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4650ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4651ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4652ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4653ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4654ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4655ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4656ed502f03SStefano Zampini auto zibbit = 
thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4657ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4658ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4659ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4660ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4661a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 46621a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4663ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4664ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4665ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4666ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4667ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4668ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4669ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4670ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 46711a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4672ed502f03SStefano Zampini } 4673ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4674ed502f03SStefano Zampini } 4675ed502f03SStefano Zampini } 4676ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4677ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4678ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4679ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4680ed502f03SStefano Zampini PetscFunctionReturn(0); 4681ed502f03SStefano Zampini } 4682c215019aSStefano Zampini 4683c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4684c215019aSStefano Zampini { 4685c215019aSStefano Zampini PetscErrorCode ierr; 4686c215019aSStefano Zampini bool dmem; 4687c215019aSStefano Zampini const PetscScalar *av; 4688c215019aSStefano Zampini cudaError_t cerr; 4689c215019aSStefano Zampini 4690c215019aSStefano Zampini PetscFunctionBegin; 4691c215019aSStefano Zampini dmem = isCudaMem(v); 4692c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4693c215019aSStefano Zampini if (n && idx) { 4694c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4695c215019aSStefano Zampini widx.assign(idx,idx+n); 4696c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4697c215019aSStefano Zampini 4698c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4699c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 
4700c215019aSStefano Zampini if (dmem) { 4701c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4702c215019aSStefano Zampini } else { 4703c215019aSStefano Zampini w = new THRUSTARRAY(n); 4704c215019aSStefano Zampini dv = w->data(); 4705c215019aSStefano Zampini } 4706c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4707c215019aSStefano Zampini 4708c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4709c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4710c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4711c215019aSStefano Zampini if (w) { 4712c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4713c215019aSStefano Zampini } 4714c215019aSStefano Zampini delete w; 4715c215019aSStefano Zampini } else { 4716c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4717c215019aSStefano Zampini } 4718c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4719c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4720c215019aSStefano Zampini PetscFunctionReturn(0); 4721c215019aSStefano Zampini } 4722