19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 16e8d2b73aSMark Adams 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 726fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 769ae82921SPaul Mullowney 777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 827f756511SDominic Meiser 83042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 8657181aedSStefano Zampini 877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 897e8381f9SStefano Zampini 90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91c215019aSStefano Zampini 92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93b06137fdSPaul Mullowney { 94b06137fdSPaul Mullowney cusparseStatus_t stat; 95b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 96b06137fdSPaul Mullowney 97b06137fdSPaul Mullowney PetscFunctionBegin; 98d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10057d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101b06137fdSPaul Mullowney PetscFunctionReturn(0); 102b06137fdSPaul Mullowney } 103b06137fdSPaul Mullowney 104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105b06137fdSPaul Mullowney { 106b06137fdSPaul Mullowney cusparseStatus_t stat; 107b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108b06137fdSPaul Mullowney 109b06137fdSPaul Mullowney PetscFunctionBegin; 110d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1116b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11216a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11357d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11416a2e217SAlejandro Lamas Daviña } 115b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1166b1cf21dSAlejandro Lamas Daviña } 11757d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118b06137fdSPaul Mullowney PetscFunctionReturn(0); 119b06137fdSPaul Mullowney } 120b06137fdSPaul Mullowney 121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122b06137fdSPaul Mullowney { 123b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1247e8381f9SStefano Zampini PetscBool flg; 1257e8381f9SStefano Zampini PetscErrorCode ierr; 126ccdfe979SStefano Zampini 127b06137fdSPaul Mullowney PetscFunctionBegin; 1287e8381f9SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1297e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 131b06137fdSPaul Mullowney PetscFunctionReturn(0); 132b06137fdSPaul Mullowney } 133b06137fdSPaul Mullowney 134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1359ae82921SPaul Mullowney { 1369ae82921SPaul Mullowney PetscFunctionBegin; 1379ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1389ae82921SPaul Mullowney PetscFunctionReturn(0); 1399ae82921SPaul Mullowney } 1409ae82921SPaul Mullowney 141c708e6cdSJed Brown /*MC 142087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
148c708e6cdSJed Brown 1499ae82921SPaul Mullowney Level: beginner 150c708e6cdSJed Brown 1513ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152c708e6cdSJed Brown M*/ 1539ae82921SPaul Mullowney 15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1559ae82921SPaul Mullowney { 1569ae82921SPaul Mullowney PetscErrorCode ierr; 157bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1589ae82921SPaul Mullowney 1599ae82921SPaul Mullowney PetscFunctionBegin; 160bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1622c7c0729SBarry Smith (*B)->factortype = ftype; 1639ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1642205254eSKarl Rupp 165*9c1083e7SRichard Tran Mills if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 166087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16733d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 168*9c1083e7SRichard Tran Mills if (!A->boundtocpu) { 1699ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1709ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 171*9c1083e7SRichard Tran Mills } else { 172*9c1083e7SRichard Tran Mills (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 173*9c1083e7SRichard Tran Mills (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 174*9c1083e7SRichard Tran Mills } 1754ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1764ac6704cSBarry Smith ierr = 
PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 1774ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 178087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 179*9c1083e7SRichard Tran Mills if (!A->boundtocpu) { 180087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 181087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 182*9c1083e7SRichard Tran Mills } else { 183*9c1083e7SRichard Tran Mills (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 184*9c1083e7SRichard Tran Mills (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 185*9c1083e7SRichard Tran Mills } 1864ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 1874ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 1889ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 189bc3f50f2SPaul Mullowney 190fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1914ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1923ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1939ae82921SPaul Mullowney PetscFunctionReturn(0); 1949ae82921SPaul Mullowney } 1959ae82921SPaul Mullowney 196bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 197ca45077fSPaul Mullowney { 198aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1996e111a19SKarl Rupp 
200ca45077fSPaul Mullowney PetscFunctionBegin; 201ca45077fSPaul Mullowney switch (op) { 202e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 203aa372e3fSPaul Mullowney cusparsestruct->format = format; 204ca45077fSPaul Mullowney break; 205e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 206aa372e3fSPaul Mullowney cusparsestruct->format = format; 207ca45077fSPaul Mullowney break; 208ca45077fSPaul Mullowney default: 20936d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 210ca45077fSPaul Mullowney } 211ca45077fSPaul Mullowney PetscFunctionReturn(0); 212ca45077fSPaul Mullowney } 2139ae82921SPaul Mullowney 214e057df02SPaul Mullowney /*@ 215e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 216e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 217aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 218e057df02SPaul Mullowney Not Collective 219e057df02SPaul Mullowney 220e057df02SPaul Mullowney Input Parameters: 2218468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 22236d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2232692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 224e057df02SPaul Mullowney 225e057df02SPaul Mullowney Output Parameter: 226e057df02SPaul Mullowney 227e057df02SPaul Mullowney Level: intermediate 228e057df02SPaul Mullowney 2298468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 230e057df02SPaul Mullowney @*/ 231e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 232e057df02SPaul Mullowney { 233e057df02SPaul Mullowney PetscErrorCode ierr; 2346e111a19SKarl Rupp 235e057df02SPaul Mullowney PetscFunctionBegin; 236e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 237e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 238e057df02SPaul Mullowney PetscFunctionReturn(0); 239e057df02SPaul Mullowney } 240e057df02SPaul Mullowney 2411a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 242e6e9a74fSStefano Zampini { 243e6e9a74fSStefano Zampini PetscErrorCode ierr; 244e6e9a74fSStefano Zampini 245e6e9a74fSStefano Zampini PetscFunctionBegin; 2461a2c6b5cSJunchao Zhang switch (op) { 2471a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2481a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2491a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2501a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2511a2c6b5cSJunchao Zhang break; 2521a2c6b5cSJunchao Zhang default: 2531a2c6b5cSJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2541a2c6b5cSJunchao Zhang break; 255e6e9a74fSStefano Zampini } 256e6e9a74fSStefano Zampini PetscFunctionReturn(0); 257e6e9a74fSStefano Zampini } 258e6e9a74fSStefano Zampini 259bddcd29dSMark Adams static 
PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 260bddcd29dSMark Adams 261bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 262bddcd29dSMark Adams { 263bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 264bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 265bddcd29dSMark Adams PetscBool row_identity,col_identity; 266bddcd29dSMark Adams PetscErrorCode ierr; 267bddcd29dSMark Adams 268bddcd29dSMark Adams PetscFunctionBegin; 269bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 270bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 271bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 272bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 273bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 274bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 275bddcd29dSMark Adams if (row_identity && col_identity) { 276bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 277bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 278bddcd29dSMark Adams B->ops->matsolve = NULL; 279bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 280bddcd29dSMark Adams } else { 281bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 282bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 283bddcd29dSMark Adams B->ops->matsolve = NULL; 284bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 285bddcd29dSMark Adams } 286bddcd29dSMark Adams 287bddcd29dSMark Adams /* get the triangular factors */ 288bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 289bddcd29dSMark Adams PetscFunctionReturn(0); 290bddcd29dSMark Adams } 291bddcd29dSMark Adams 2924416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems 
*PetscOptionsObject,Mat A) 2939ae82921SPaul Mullowney { 2949ae82921SPaul Mullowney PetscErrorCode ierr; 295e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2969ae82921SPaul Mullowney PetscBool flg; 297a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2986e111a19SKarl Rupp 2999ae82921SPaul Mullowney PetscFunctionBegin; 300e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 3019ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 302e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 303a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 304afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 305afb2bd1cSJunchao Zhang 3064c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 307a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 308afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 309afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 310afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 311afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 312afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 313a435da06SStefano 
Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 314a435da06SStefano Zampini if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 315a435da06SStefano Zampini #else 316afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 317a435da06SStefano Zampini #endif 318afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 319afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 320afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 321afb2bd1cSJunchao Zhang 322afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 323afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 324afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 325afb2bd1cSJunchao Zhang #endif 3264c87dfd4SPaul Mullowney } 3270af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3289ae82921SPaul Mullowney PetscFunctionReturn(0); 3299ae82921SPaul Mullowney } 3309ae82921SPaul Mullowney 3316fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3329ae82921SPaul Mullowney { 
333da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3349ae82921SPaul Mullowney PetscErrorCode ierr; 3359ae82921SPaul Mullowney 3369ae82921SPaul Mullowney PetscFunctionBegin; 337da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3389ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3399ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3409ae82921SPaul Mullowney PetscFunctionReturn(0); 3419ae82921SPaul Mullowney } 3429ae82921SPaul Mullowney 3436fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3449ae82921SPaul Mullowney { 345da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3469ae82921SPaul Mullowney PetscErrorCode ierr; 3479ae82921SPaul Mullowney 3489ae82921SPaul Mullowney PetscFunctionBegin; 349da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3509ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3519ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3529ae82921SPaul Mullowney PetscFunctionReturn(0); 3539ae82921SPaul Mullowney } 3549ae82921SPaul Mullowney 355087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 356087f3262SPaul Mullowney { 357da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 358087f3262SPaul Mullowney PetscErrorCode ierr; 359087f3262SPaul Mullowney 360087f3262SPaul Mullowney PetscFunctionBegin; 361da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 362087f3262SPaul Mullowney ierr = 
MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 363087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 364087f3262SPaul Mullowney PetscFunctionReturn(0); 365087f3262SPaul Mullowney } 366087f3262SPaul Mullowney 367087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 368087f3262SPaul Mullowney { 369da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 370087f3262SPaul Mullowney PetscErrorCode ierr; 371087f3262SPaul Mullowney 372087f3262SPaul Mullowney PetscFunctionBegin; 373da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 374087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 375087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 376087f3262SPaul Mullowney PetscFunctionReturn(0); 377087f3262SPaul Mullowney } 378087f3262SPaul Mullowney 379087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3809ae82921SPaul Mullowney { 3819ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3829ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3839ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 384aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3859ae82921SPaul Mullowney cusparseStatus_t stat; 3869ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3879ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3889ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3899ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 390b175d8bbSPaul Mullowney PetscErrorCode ierr; 39157d48284SJunchao Zhang cudaError_t cerr; 3929ae82921SPaul 
Mullowney 3939ae82921SPaul Mullowney PetscFunctionBegin; 394cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 395c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3969ae82921SPaul Mullowney try { 3979ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3989ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 399da79fbbcSStefano Zampini if (!loTriFactor) { 4002cbc15d9SMark PetscScalar *AALo; 4012cbc15d9SMark 4022cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4039ae82921SPaul Mullowney 4049ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 40557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 40657d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 4079ae82921SPaul Mullowney 4089ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 4099ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 4109ae82921SPaul Mullowney AiLo[n] = nzLower; 4119ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 4129ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4139ae82921SPaul Mullowney v = aa; 4149ae82921SPaul Mullowney vi = aj; 4159ae82921SPaul Mullowney offset = 1; 4169ae82921SPaul Mullowney rowOffset= 1; 4179ae82921SPaul Mullowney for (i=1; i<n; i++) { 4189ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 419e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4209ae82921SPaul Mullowney AiLo[i] = rowOffset; 4219ae82921SPaul Mullowney rowOffset += nz+1; 4229ae82921SPaul Mullowney 423580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 424580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 4259ae82921SPaul Mullowney 4269ae82921SPaul Mullowney offset += nz; 4279ae82921SPaul Mullowney AjLo[offset] = (PetscInt) 
i; 4289ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4299ae82921SPaul Mullowney offset += 1; 4309ae82921SPaul Mullowney 4319ae82921SPaul Mullowney v += nz; 4329ae82921SPaul Mullowney vi += nz; 4339ae82921SPaul Mullowney } 4342205254eSKarl Rupp 435aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 436da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 437da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 438aa372e3fSPaul Mullowney /* Create the matrix description */ 43957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 44057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 442afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 443afb2bd1cSJunchao Zhang #else 44457d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 445afb2bd1cSJunchao Zhang #endif 44657d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 44757d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 448aa372e3fSPaul Mullowney 449aa372e3fSPaul Mullowney /* set the operation */ 450aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 451aa372e3fSPaul Mullowney 452aa372e3fSPaul Mullowney /* set the matrix */ 453aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 454aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 455aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 456aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 457aa372e3fSPaul Mullowney 458aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = 
new THRUSTINTARRAY32(n+1); 459aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 460aa372e3fSPaul Mullowney 461aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 462aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 463aa372e3fSPaul Mullowney 464aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 465aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 466aa372e3fSPaul Mullowney 467afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 468da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 469afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 471afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 472afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 473afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 474afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 475afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 476afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 477afb2bd1cSJunchao Zhang #endif 478afb2bd1cSJunchao Zhang 479aa372e3fSPaul Mullowney /* perform the solve analysis */ 480aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 481aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 482aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 483d49cd2b7SBarry 
Smith loTriFactor->csrMat->column_indices->data().get(), 4841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 485d49cd2b7SBarry Smith loTriFactor->solveInfo, 486d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 487d49cd2b7SBarry Smith #else 488d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 489afb2bd1cSJunchao Zhang #endif 490da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 491da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 492aa372e3fSPaul Mullowney 493da79fbbcSStefano Zampini /* assign the pointer */ 494aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4952cbc15d9SMark loTriFactor->AA_h = AALo; 49657d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 49757d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4984863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 499da79fbbcSStefano Zampini } else { /* update values only */ 5002cbc15d9SMark if (!loTriFactor->AA_h) { 5012cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 5022cbc15d9SMark } 503da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 5042cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 505da79fbbcSStefano Zampini v = aa; 506da79fbbcSStefano Zampini vi = aj; 507da79fbbcSStefano Zampini offset = 1; 508da79fbbcSStefano Zampini for (i=1; i<n; i++) { 509da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 5102cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 511da79fbbcSStefano Zampini offset += nz; 5122cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 513da79fbbcSStefano Zampini offset += 1; 514da79fbbcSStefano Zampini v += nz; 515da79fbbcSStefano Zampini } 5162cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, 
loTriFactor->AA_h+nzLower); 517da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 518da79fbbcSStefano Zampini } 5199ae82921SPaul Mullowney } catch(char *ex) { 5209ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5219ae82921SPaul Mullowney } 5229ae82921SPaul Mullowney } 5239ae82921SPaul Mullowney PetscFunctionReturn(0); 5249ae82921SPaul Mullowney } 5259ae82921SPaul Mullowney 526087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5279ae82921SPaul Mullowney { 5289ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5299ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5309ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 531aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5329ae82921SPaul Mullowney cusparseStatus_t stat; 5339ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5349ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5359ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5369ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5379ae82921SPaul Mullowney PetscErrorCode ierr; 53857d48284SJunchao Zhang cudaError_t cerr; 5399ae82921SPaul Mullowney 5409ae82921SPaul Mullowney PetscFunctionBegin; 541cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 542c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5439ae82921SPaul Mullowney try { 5449ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5459ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 546da79fbbcSStefano Zampini if (!upTriFactor) { 5472cbc15d9SMark PetscScalar *AAUp; 5482cbc15d9SMark 5492cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5502cbc15d9SMark 5519ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 55257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 55357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5549ae82921SPaul Mullowney 5559ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5569ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5579ae82921SPaul Mullowney AiUp[n]=nzUpper; 5589ae82921SPaul Mullowney offset = nzUpper; 5599ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5609ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5619ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5629ae82921SPaul Mullowney 563e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5649ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5659ae82921SPaul Mullowney 566e057df02SPaul Mullowney /* decrement the offset */ 5679ae82921SPaul Mullowney offset -= (nz+1); 5689ae82921SPaul Mullowney 569e057df02SPaul Mullowney /* first, set the diagonal elements */ 5709ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 57109f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5729ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5739ae82921SPaul Mullowney 574580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 575580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5769ae82921SPaul Mullowney } 5772205254eSKarl Rupp 578aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 579da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 580da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5812205254eSKarl Rupp 582aa372e3fSPaul Mullowney /* Create the matrix description */ 58357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 58457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 586afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 587afb2bd1cSJunchao Zhang #else 58857d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 589afb2bd1cSJunchao Zhang #endif 59057d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 59157d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 592aa372e3fSPaul Mullowney 593aa372e3fSPaul Mullowney /* set the operation */ 594aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 595aa372e3fSPaul Mullowney 596aa372e3fSPaul Mullowney /* set the matrix */ 597aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 598aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 599aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 600aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 601aa372e3fSPaul Mullowney 602aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 603aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 604aa372e3fSPaul Mullowney 605aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 606aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 607aa372e3fSPaul Mullowney 608aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
609aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 610aa372e3fSPaul Mullowney 611afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 612da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 613afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 6141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 615afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 616afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 617afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 618afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 619afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 620afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 621afb2bd1cSJunchao Zhang #endif 622afb2bd1cSJunchao Zhang 623aa372e3fSPaul Mullowney /* perform the solve analysis */ 624aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 625aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 626aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 627d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 629d49cd2b7SBarry Smith upTriFactor->solveInfo, 630d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 631d49cd2b7SBarry Smith #else 632d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 633afb2bd1cSJunchao Zhang #endif 634da79fbbcSStefano Zampini cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 635da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 636aa372e3fSPaul Mullowney 637da79fbbcSStefano Zampini /* assign the pointer */ 638aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6392cbc15d9SMark upTriFactor->AA_h = AAUp; 64057d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 64157d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6424863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 643da79fbbcSStefano Zampini } else { 6442cbc15d9SMark if (!upTriFactor->AA_h) { 6452cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6462cbc15d9SMark } 647da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 648da79fbbcSStefano Zampini offset = nzUpper; 649da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 650da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 651da79fbbcSStefano Zampini 652da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 653da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 654da79fbbcSStefano Zampini 655da79fbbcSStefano Zampini /* decrement the offset */ 656da79fbbcSStefano Zampini offset -= (nz+1); 657da79fbbcSStefano Zampini 658da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6592cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6602cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 661da79fbbcSStefano Zampini } 6622cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 663da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 664da79fbbcSStefano Zampini } 6659ae82921SPaul Mullowney } catch(char *ex) { 6669ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6679ae82921SPaul 
Mullowney } 6689ae82921SPaul Mullowney } 6699ae82921SPaul Mullowney PetscFunctionReturn(0); 6709ae82921SPaul Mullowney } 6719ae82921SPaul Mullowney 672087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6739ae82921SPaul Mullowney { 6749ae82921SPaul Mullowney PetscErrorCode ierr; 6759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6769ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6779ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6789ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6799ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6809ae82921SPaul Mullowney 6819ae82921SPaul Mullowney PetscFunctionBegin; 682da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 683087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 684087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6852205254eSKarl Rupp 686da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 687aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6889ae82921SPaul Mullowney 689c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 690e057df02SPaul Mullowney /* lower triangular indices */ 6919ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 692da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 693da79fbbcSStefano Zampini const PetscInt *r; 694da79fbbcSStefano Zampini 695da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 696aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 697aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6989ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 699da79fbbcSStefano Zampini ierr 
= PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 700da79fbbcSStefano Zampini } 7019ae82921SPaul Mullowney 702e057df02SPaul Mullowney /* upper triangular indices */ 7039ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 704da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 705da79fbbcSStefano Zampini const PetscInt *c; 706da79fbbcSStefano Zampini 707da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 708aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 709aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 7109ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 711da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 712da79fbbcSStefano Zampini } 7139ae82921SPaul Mullowney PetscFunctionReturn(0); 7149ae82921SPaul Mullowney } 7159ae82921SPaul Mullowney 716087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 717087f3262SPaul Mullowney { 718087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 719087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 720aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 721aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 722087f3262SPaul Mullowney cusparseStatus_t stat; 723087f3262SPaul Mullowney PetscErrorCode ierr; 72457d48284SJunchao Zhang cudaError_t cerr; 725087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 726087f3262SPaul Mullowney PetscScalar *AAUp; 727087f3262SPaul Mullowney PetscScalar *AALo; 728087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 729087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 730087f3262SPaul 
Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 731087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 732087f3262SPaul Mullowney 733087f3262SPaul Mullowney PetscFunctionBegin; 734cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 735c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 736087f3262SPaul Mullowney try { 737da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 738da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 739da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 740087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 74157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 74257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 743087f3262SPaul Mullowney 744087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 745087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 746087f3262SPaul Mullowney AiUp[n]=nzUpper; 747087f3262SPaul Mullowney offset = 0; 748087f3262SPaul Mullowney for (i=0; i<n; i++) { 749087f3262SPaul Mullowney /* set the pointers */ 750087f3262SPaul Mullowney v = aa + ai[i]; 751087f3262SPaul Mullowney vj = aj + ai[i]; 752087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 753087f3262SPaul Mullowney 754087f3262SPaul Mullowney /* first, set the diagonal elements */ 755087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 75609f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 757087f3262SPaul Mullowney AiUp[i] = offset; 75809f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 759087f3262SPaul Mullowney 760087f3262SPaul Mullowney offset+=1; 761087f3262SPaul Mullowney if (nz>0) { 762f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 
763580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 764087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 765087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 766087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 767087f3262SPaul Mullowney } 768087f3262SPaul Mullowney offset+=nz; 769087f3262SPaul Mullowney } 770087f3262SPaul Mullowney } 771087f3262SPaul Mullowney 772aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 773da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 774da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 775087f3262SPaul Mullowney 776aa372e3fSPaul Mullowney /* Create the matrix description */ 77757d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 77857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 780afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 781afb2bd1cSJunchao Zhang #else 78257d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 783afb2bd1cSJunchao Zhang #endif 78457d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 78557d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 786087f3262SPaul Mullowney 787aa372e3fSPaul Mullowney /* set the matrix */ 788aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 789aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 790aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 791aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 792aa372e3fSPaul Mullowney 793aa372e3fSPaul Mullowney 
upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 794aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 795aa372e3fSPaul Mullowney 796aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 797aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 798aa372e3fSPaul Mullowney 799aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 800aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 801aa372e3fSPaul Mullowney 802afb2bd1cSJunchao Zhang /* set the operation */ 803afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 804afb2bd1cSJunchao Zhang 805afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 806da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 807afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 810afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 811afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 812afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 813afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 814afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 815afb2bd1cSJunchao Zhang #endif 816afb2bd1cSJunchao Zhang 817aa372e3fSPaul Mullowney /* perform the solve analysis */ 818aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 819aa372e3fSPaul Mullowney 
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 820aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 821d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 8221b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823d49cd2b7SBarry Smith upTriFactor->solveInfo, 824d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 825d49cd2b7SBarry Smith #else 826d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 827afb2bd1cSJunchao Zhang #endif 828da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 829da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 830aa372e3fSPaul Mullowney 831da79fbbcSStefano Zampini /* assign the pointer */ 832aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 833aa372e3fSPaul Mullowney 834aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 835da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 836da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 837aa372e3fSPaul Mullowney 838aa372e3fSPaul Mullowney /* Create the matrix description */ 83957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 84057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 842afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 843afb2bd1cSJunchao Zhang #else 84457d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 845afb2bd1cSJunchao Zhang #endif 84657d48284SJunchao Zhang stat = 
cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 84757d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 848aa372e3fSPaul Mullowney 849aa372e3fSPaul Mullowney /* set the operation */ 850aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 851aa372e3fSPaul Mullowney 852aa372e3fSPaul Mullowney /* set the matrix */ 853aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 854aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 855aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 856aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 857aa372e3fSPaul Mullowney 858aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 859aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 860aa372e3fSPaul Mullowney 861aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 862aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 863aa372e3fSPaul Mullowney 864aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 865aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 866aa372e3fSPaul Mullowney 867afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 868da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 869afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 872afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 873afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 874afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 875afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 876afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 877afb2bd1cSJunchao Zhang #endif 878afb2bd1cSJunchao Zhang 879aa372e3fSPaul Mullowney /* perform the solve analysis */ 880aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 881aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 882aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 883d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 8841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 885d49cd2b7SBarry Smith loTriFactor->solveInfo, 886d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 887d49cd2b7SBarry Smith #else 888d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 889afb2bd1cSJunchao Zhang #endif 890da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 891da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 892aa372e3fSPaul Mullowney 893da79fbbcSStefano Zampini /* assign the pointer */ 894aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 895087f3262SPaul Mullowney 896da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 89757d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 89857d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 899da79fbbcSStefano Zampini } else { 900da79fbbcSStefano Zampini /* Fill the upper 
triangular matrix */ 901da79fbbcSStefano Zampini offset = 0; 902da79fbbcSStefano Zampini for (i=0; i<n; i++) { 903da79fbbcSStefano Zampini /* set the pointers */ 904da79fbbcSStefano Zampini v = aa + ai[i]; 905da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 906da79fbbcSStefano Zampini 907da79fbbcSStefano Zampini /* first, set the diagonal elements */ 908da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 909da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 910da79fbbcSStefano Zampini 911da79fbbcSStefano Zampini offset+=1; 912da79fbbcSStefano Zampini if (nz>0) { 913da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 914da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 915da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 916da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 917da79fbbcSStefano Zampini } 918da79fbbcSStefano Zampini offset+=nz; 919da79fbbcSStefano Zampini } 920da79fbbcSStefano Zampini } 921da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 922da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 923da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 924da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 925da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 926da79fbbcSStefano Zampini } 92757d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 92857d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 929087f3262SPaul Mullowney } catch(char *ex) { 930087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 931087f3262SPaul Mullowney } 932087f3262SPaul Mullowney } 933087f3262SPaul Mullowney PetscFunctionReturn(0); 934087f3262SPaul Mullowney } 935087f3262SPaul Mullowney 936087f3262SPaul Mullowney static PetscErrorCode 
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9379ae82921SPaul Mullowney { 9389ae82921SPaul Mullowney PetscErrorCode ierr; 939087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 940087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 941087f3262SPaul Mullowney IS ip = a->row; 942087f3262SPaul Mullowney PetscBool perm_identity; 943087f3262SPaul Mullowney PetscInt n = A->rmap->n; 944087f3262SPaul Mullowney 945087f3262SPaul Mullowney PetscFunctionBegin; 946da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 947087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 948da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 949aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 950aa372e3fSPaul Mullowney 951da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 952da79fbbcSStefano Zampini 953087f3262SPaul Mullowney /* lower triangular indices */ 954087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 955087f3262SPaul Mullowney if (!perm_identity) { 9564e4bbfaaSStefano Zampini IS iip; 957da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9584e4bbfaaSStefano Zampini 9594e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9604e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 961da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 962aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 963aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 964aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9654e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9664e4bbfaaSStefano Zampini ierr = 
ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9674e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 968087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 969da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 970da79fbbcSStefano Zampini } 971087f3262SPaul Mullowney PetscFunctionReturn(0); 972087f3262SPaul Mullowney } 973087f3262SPaul Mullowney 974087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 975087f3262SPaul Mullowney { 976087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 977087f3262SPaul Mullowney IS ip = b->row; 978087f3262SPaul Mullowney PetscBool perm_identity; 979b175d8bbSPaul Mullowney PetscErrorCode ierr; 980087f3262SPaul Mullowney 981087f3262SPaul Mullowney PetscFunctionBegin; 98257181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 983087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 984ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 985087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 986087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 987087f3262SPaul Mullowney if (perm_identity) { 988087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 989087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9904e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9914e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 992087f3262SPaul Mullowney } else { 993087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 994087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9954e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9964e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 997087f3262SPaul Mullowney } 998087f3262SPaul Mullowney 999087f3262SPaul Mullowney /* get the triangular factors */ 1000087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1001087f3262SPaul Mullowney PetscFunctionReturn(0); 1002087f3262SPaul Mullowney } 10039ae82921SPaul Mullowney 1004b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1005bda325fcSPaul Mullowney { 1006bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1007aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1008aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1009da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1010da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1011bda325fcSPaul Mullowney cusparseStatus_t stat; 1012aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1013aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1014aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 
1015aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 10161b0a6780SStefano Zampini cudaError_t cerr; 1017da79fbbcSStefano Zampini PetscErrorCode ierr; 1018b175d8bbSPaul Mullowney 1019bda325fcSPaul Mullowney PetscFunctionBegin; 1020aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1021da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1022da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1023aa372e3fSPaul Mullowney 1024aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1025aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1026aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1027aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1028aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1029aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1030aa372e3fSPaul Mullowney 1031aa372e3fSPaul Mullowney /* Create the matrix description */ 103257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 103357d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 103457d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 103557d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 103657d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1037aa372e3fSPaul Mullowney 1038aa372e3fSPaul Mullowney /* set the operation */ 1039aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1040aa372e3fSPaul Mullowney 1041aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1042aa372e3fSPaul 
Mullowney loTriFactorT->csrMat = new CsrMatrix; 1043afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1044afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1045aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1046afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1047afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1048afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1049aa372e3fSPaul Mullowney 1050aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1051afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1052afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1053afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1054afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1055afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1056afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1057afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1058afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1060afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10611b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1062afb2bd1cSJunchao Zhang #endif 1063afb2bd1cSJunchao Zhang 1064da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 
1065aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1066aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1067aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1068aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1069aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1070aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1071afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1072afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1073afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1074d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1075afb2bd1cSJunchao Zhang #else 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1077d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1078afb2bd1cSJunchao Zhang #endif 1079da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1080da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1081aa372e3fSPaul Mullowney 1082afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1083da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1084afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1087afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1088afb2bd1cSJunchao Zhang 
loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1089afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1090afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1091afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1092afb2bd1cSJunchao Zhang #endif 1093afb2bd1cSJunchao Zhang 1094afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1095aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1096afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1097afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1098d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 10991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1100d49cd2b7SBarry Smith loTriFactorT->solveInfo, 1101d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1102d49cd2b7SBarry Smith #else 1103d49cd2b7SBarry Smith loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1104afb2bd1cSJunchao Zhang #endif 1105da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1106da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1107aa372e3fSPaul Mullowney 1108da79fbbcSStefano Zampini /* assign the pointer */ 1109aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1110aa372e3fSPaul Mullowney 1111aa372e3fSPaul Mullowney /*********************************************/ 1112aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1113aa372e3fSPaul Mullowney /*********************************************/ 1114aa372e3fSPaul Mullowney 1115aa372e3fSPaul Mullowney /* allocate space for the 
transpose of the upper triangular factor */ 1116da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1117da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1118aa372e3fSPaul Mullowney 1119aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1120aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1121aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1122aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1123aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1124aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1125aa372e3fSPaul Mullowney 1126aa372e3fSPaul Mullowney /* Create the matrix description */ 112757d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 112857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 112957d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 113057d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 113157d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1132aa372e3fSPaul Mullowney 1133aa372e3fSPaul Mullowney /* set the operation */ 1134aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1135aa372e3fSPaul Mullowney 1136aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1137aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1138afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1139afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1140aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1141afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1142afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1143afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1144aa372e3fSPaul Mullowney 1145aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1146afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1147afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1148afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1149afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1150afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1151afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1152afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1153afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1155afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1156afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1157afb2bd1cSJunchao Zhang #endif 1158afb2bd1cSJunchao Zhang 1159da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1160aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1161aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1162aa372e3fSPaul Mullowney 
upTriFactor->csrMat->values->data().get(), 1163aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1164aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1165aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1166afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1167afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1168afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1169d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1170afb2bd1cSJunchao Zhang #else 1171afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1172d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1173afb2bd1cSJunchao Zhang #endif 1174d49cd2b7SBarry Smith 1175da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1176da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1177aa372e3fSPaul Mullowney 1178afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1179da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1180afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1183afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1184afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1185afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1186afb2bd1cSJunchao Zhang 
&upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1187afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1188afb2bd1cSJunchao Zhang #endif 1189afb2bd1cSJunchao Zhang 1190afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1191aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1192afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1193afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1194d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 11951b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1196d49cd2b7SBarry Smith upTriFactorT->solveInfo, 1197d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1198d49cd2b7SBarry Smith #else 1199d49cd2b7SBarry Smith upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1200afb2bd1cSJunchao Zhang #endif 1201d49cd2b7SBarry Smith 1202da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1203da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1204aa372e3fSPaul Mullowney 1205da79fbbcSStefano Zampini /* assign the pointer */ 1206aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1207bda325fcSPaul Mullowney PetscFunctionReturn(0); 1208bda325fcSPaul Mullowney } 1209bda325fcSPaul Mullowney 1210a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1211a49f1ed0SStefano Zampini { 1212a49f1ed0SStefano Zampini __host__ __device__ 1213a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1214a49f1ed0SStefano Zampini { 1215a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1216a49f1ed0SStefano Zampini } 1217a49f1ed0SStefano Zampini }; 1218a49f1ed0SStefano Zampini 12193606e59fSJunchao Zhang static 
PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1220bda325fcSPaul Mullowney { 1221aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1222a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1223bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1224bda325fcSPaul Mullowney cusparseStatus_t stat; 1225aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1226b06137fdSPaul Mullowney cudaError_t err; 122785ba7357SStefano Zampini PetscErrorCode ierr; 1228b175d8bbSPaul Mullowney 1229bda325fcSPaul Mullowney PetscFunctionBegin; 1230a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1231a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1232e8d2b73aSMark Adams if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1233a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1234e8d2b73aSMark Adams if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12351a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 123685ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1237ee7b52eaSHong Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1238a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1239a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1240a49f1ed0SStefano Zampini } 1241a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1242aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 124357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1244aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 124557d48284SJunchao Zhang stat = 
cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 124657d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1247aa372e3fSPaul Mullowney 1248b06137fdSPaul Mullowney /* set alpha and beta */ 1249afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12507656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12517656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1252afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12537656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12547656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1255b06137fdSPaul Mullowney 1256aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1257aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1258a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1259554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1260554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1261aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1262a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1263aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1264aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1265a3fdcf43SKarl Rupp 1266039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 126781902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1268afb2bd1cSJunchao Zhang 
1269afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 12703606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1271afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1272afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1273afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1274afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1275afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1276afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 12773606e59fSJunchao Zhang #else 12783606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12793606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12803606e59fSJunchao Zhang 12813606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12823606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12833606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
12843606e59fSJunchao Zhang */ 12853606e59fSJunchao Zhang if (matrixT->num_entries) { 12863606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 12873606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 12883606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 12893606e59fSJunchao Zhang matrixT->values->data().get(), 12903606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 12913606e59fSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 12923606e59fSJunchao Zhang 12933606e59fSJunchao Zhang } else { 12943606e59fSJunchao Zhang matstructT->matDescr = NULL; 12953606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12963606e59fSJunchao Zhang } 12973606e59fSJunchao Zhang #endif 1298afb2bd1cSJunchao Zhang #endif 1299aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1300afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1301afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1302afb2bd1cSJunchao Zhang #else 1303aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 130451c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 130551c6d536SStefano Zampini /* First convert HYB to CSR */ 1306aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1307aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1308aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1309aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1310aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1311aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1312aa372e3fSPaul Mullowney 1313aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1314aa372e3fSPaul Mullowney matstruct->descr, 
(cusparseHybMat_t)matstruct->mat, 1315aa372e3fSPaul Mullowney temp->values->data().get(), 1316aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 131757d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1318aa372e3fSPaul Mullowney 1319aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1320aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1321aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1322aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1323aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1324aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1325aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1326aa372e3fSPaul Mullowney 1327aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1328aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1329aa372e3fSPaul Mullowney temp->values->data().get(), 1330aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1331aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1332aa372e3fSPaul Mullowney tempT->values->data().get(), 1333aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1334aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 133557d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1336aa372e3fSPaul Mullowney 1337aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1338aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 133957d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1340aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1341aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1342aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1343aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1344aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1345aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 134657d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1347aa372e3fSPaul Mullowney 1348aa372e3fSPaul Mullowney /* assign the pointer */ 1349aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13501a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1351aa372e3fSPaul Mullowney /* delete temporaries */ 1352aa372e3fSPaul Mullowney if (tempT) { 1353aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1354aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1355aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1356aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1357087f3262SPaul Mullowney } 1358aa372e3fSPaul Mullowney if (temp) { 1359aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1360aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1361aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1362aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1363aa372e3fSPaul Mullowney } 1364afb2bd1cSJunchao Zhang #endif 1365aa372e3fSPaul Mullowney } 1366a49f1ed0SStefano Zampini } 1367a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1368a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1369a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1370e8d2b73aSMark Adams if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1371e8d2b73aSMark Adams if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1372e8d2b73aSMark Adams if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1373e8d2b73aSMark Adams if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1374e8d2b73aSMark Adams if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1375e8d2b73aSMark Adams if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1376e8d2b73aSMark Adams if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1377e8d2b73aSMark Adams if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1378a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1379a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1380a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1381a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1382a49f1ed0SStefano Zampini } 1383a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1384a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1385a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1386a49f1ed0SStefano Zampini 1387a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1388a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1389a49f1ed0SStefano Zampini void *csr2cscBuffer; 1390a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1391a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 
1392a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1393a49f1ed0SStefano Zampini matrix->values->data().get(), 1394a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1395a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1396a49f1ed0SStefano Zampini matrixT->values->data().get(), 1397a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1398a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1399a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1400a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1401a49f1ed0SStefano Zampini #endif 1402a49f1ed0SStefano Zampini 14031a2c6b5cSJunchao Zhang if (matrix->num_entries) { 14041a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 14051a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 14061a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 14071a2c6b5cSJunchao Zhang 14081a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 14091a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
14101a2c6b5cSJunchao Zhang */ 14111a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 14121a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 14131a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 14141a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 14151a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1416a49f1ed0SStefano Zampini matrixT->values->data().get(), 1417a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1418a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1419a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 14201a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1421a49f1ed0SStefano Zampini #else 1422a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14231a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1424a49f1ed0SStefano Zampini #endif 14251a2c6b5cSJunchao Zhang } else { 14261a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14271a2c6b5cSJunchao Zhang } 14281a2c6b5cSJunchao Zhang 1429a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1430a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1431a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1432a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1433a49f1ed0SStefano Zampini #endif 1434a49f1ed0SStefano Zampini } 1435a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1436a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1437a49f1ed0SStefano Zampini matrixT->values->begin())); 1438a49f1ed0SStefano Zampini } 1439ee7b52eaSHong Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 144085ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1441213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1442213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1443aa372e3fSPaul Mullowney /* assign the pointer */ 1444aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14451a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1446bda325fcSPaul Mullowney PetscFunctionReturn(0); 1447bda325fcSPaul Mullowney } 1448bda325fcSPaul Mullowney 1449a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14506fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1451bda325fcSPaul Mullowney { 1452c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1453465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1454465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1455465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1456465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1457bda325fcSPaul Mullowney cusparseStatus_t stat; 1458bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1459aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1462b175d8bbSPaul 
Mullowney PetscErrorCode ierr; 1463bda325fcSPaul Mullowney 1464bda325fcSPaul Mullowney PetscFunctionBegin; 1465aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1466aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1467bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1468aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1469aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1470bda325fcSPaul Mullowney } 1471bda325fcSPaul Mullowney 1472bda325fcSPaul Mullowney /* Get the GPU pointers */ 1473c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1474c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1475c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1476c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1477bda325fcSPaul Mullowney 14787a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1479aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1480a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1481c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1482c41cb2e2SAlejandro Lamas Daviña xGPU); 1483aa372e3fSPaul Mullowney 1484aa372e3fSPaul Mullowney /* First, solve U */ 1485aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1486afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14871b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1488afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1489afb2bd1cSJunchao Zhang #endif 1490afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1491aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1492aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1493aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1494aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1495d49cd2b7SBarry Smith xarray, 14961b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1497d49cd2b7SBarry Smith tempGPU->data().get(), 1498d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1499d49cd2b7SBarry Smith #else 1500d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1501afb2bd1cSJunchao Zhang #endif 1502aa372e3fSPaul Mullowney 1503aa372e3fSPaul Mullowney /* Then, solve L */ 1504aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1505afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15061b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1507afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1508afb2bd1cSJunchao Zhang #endif 1509afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1510aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1511aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1512aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1513aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1514d49cd2b7SBarry Smith tempGPU->data().get(), 15151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1516d49cd2b7SBarry Smith xarray, 1517d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1518d49cd2b7SBarry Smith #else 1519d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1520afb2bd1cSJunchao Zhang #endif 1521aa372e3fSPaul Mullowney 1522aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1523a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1524c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1525aa372e3fSPaul Mullowney tempGPU->begin()); 1526aa372e3fSPaul Mullowney 1527aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1528a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1529bda325fcSPaul Mullowney 1530bda325fcSPaul Mullowney /* restore */ 1531c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1532c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1533661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1534958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1535bda325fcSPaul Mullowney PetscFunctionReturn(0); 1536bda325fcSPaul Mullowney } 1537bda325fcSPaul Mullowney 15386fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1539bda325fcSPaul Mullowney { 1540465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1541465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1542bda325fcSPaul Mullowney cusparseStatus_t stat; 1543bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1544aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1545aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1546aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1547b175d8bbSPaul Mullowney 
PetscErrorCode ierr; 1548bda325fcSPaul Mullowney 1549bda325fcSPaul Mullowney PetscFunctionBegin; 1550aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1551aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1552bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1553aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1554aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1555bda325fcSPaul Mullowney } 1556bda325fcSPaul Mullowney 1557bda325fcSPaul Mullowney /* Get the GPU pointers */ 1558c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1559c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1560bda325fcSPaul Mullowney 15617a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1562aa372e3fSPaul Mullowney /* First, solve U */ 1563aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1564afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1566afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1567afb2bd1cSJunchao Zhang #endif 1568afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1569aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1570aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1571aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1572aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1573d49cd2b7SBarry Smith barray, 15741b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1575d49cd2b7SBarry Smith tempGPU->data().get(), 1576d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1577d49cd2b7SBarry Smith #else 
1578d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1579afb2bd1cSJunchao Zhang #endif 1580aa372e3fSPaul Mullowney 1581aa372e3fSPaul Mullowney /* Then, solve L */ 1582aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1583afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1585afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1586afb2bd1cSJunchao Zhang #endif 1587afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1588aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1589aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1590aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1591aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1592d49cd2b7SBarry Smith tempGPU->data().get(), 15931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1594d49cd2b7SBarry Smith xarray, 1595d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1596d49cd2b7SBarry Smith #else 1597d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1598afb2bd1cSJunchao Zhang #endif 1599bda325fcSPaul Mullowney 1600bda325fcSPaul Mullowney /* restore */ 1601c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1602c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1603661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1604958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1605bda325fcSPaul Mullowney PetscFunctionReturn(0); 1606bda325fcSPaul Mullowney } 1607bda325fcSPaul Mullowney 16086fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 16099ae82921SPaul Mullowney { 1610465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1611465f34aeSAlejandro Lamas Daviña PetscScalar 
*xarray; 1612465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1613465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 16149ae82921SPaul Mullowney cusparseStatus_t stat; 16159ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1616aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1617aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1618aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1619b175d8bbSPaul Mullowney PetscErrorCode ierr; 16209ae82921SPaul Mullowney 16219ae82921SPaul Mullowney PetscFunctionBegin; 1622ebc8f436SDominic Meiser 1623e057df02SPaul Mullowney /* Get the GPU pointers */ 1624c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1625c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1626c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1627c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16289ae82921SPaul Mullowney 16297a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1630aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1631a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1632c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16334e4bbfaaSStefano Zampini tempGPU->begin()); 1634aa372e3fSPaul Mullowney 1635aa372e3fSPaul Mullowney /* Next, solve L */ 1636aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1637afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->num_rows, 16381b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1639afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1640afb2bd1cSJunchao Zhang #endif 1641afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1642aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1643aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1644aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1645aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1646d49cd2b7SBarry Smith tempGPU->data().get(), 16471b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1648d49cd2b7SBarry Smith xarray, 1649d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1650d49cd2b7SBarry Smith #else 1651d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1652afb2bd1cSJunchao Zhang #endif 1653aa372e3fSPaul Mullowney 1654aa372e3fSPaul Mullowney /* Then, solve U */ 1655aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1656afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16571b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1658afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1659afb2bd1cSJunchao Zhang #endif 1660afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1661aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1662aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1663aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1664d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 16651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1666d49cd2b7SBarry Smith tempGPU->data().get(), 1667d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1668d49cd2b7SBarry Smith #else 1669d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1670afb2bd1cSJunchao Zhang #endif 
1671d49cd2b7SBarry Smith 16724e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1673a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 16744e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16754e4bbfaaSStefano Zampini xGPU); 16769ae82921SPaul Mullowney 1677c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1678c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1679661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1680958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16819ae82921SPaul Mullowney PetscFunctionReturn(0); 16829ae82921SPaul Mullowney } 16839ae82921SPaul Mullowney 16846fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16859ae82921SPaul Mullowney { 1686465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1687465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16889ae82921SPaul Mullowney cusparseStatus_t stat; 16899ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1690aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1691aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1692aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1693b175d8bbSPaul Mullowney PetscErrorCode ierr; 16949ae82921SPaul Mullowney 16959ae82921SPaul Mullowney PetscFunctionBegin; 1696e057df02SPaul Mullowney /* Get the GPU pointers */ 1697c41cb2e2SAlejandro Lamas Daviña ierr = 
VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1698c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 16999ae82921SPaul Mullowney 17007a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1701aa372e3fSPaul Mullowney /* First, solve L */ 1702aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1703afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 17041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1705afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1706afb2bd1cSJunchao Zhang #endif 1707afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1708aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1709aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1710aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1711aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1712d49cd2b7SBarry Smith barray, 17131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1714d49cd2b7SBarry Smith tempGPU->data().get(), 1715d49cd2b7SBarry Smith loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1716d49cd2b7SBarry Smith #else 1717d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1718afb2bd1cSJunchao Zhang #endif 1719d49cd2b7SBarry Smith 1720aa372e3fSPaul Mullowney /* Next, solve U */ 1721aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1722afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17231b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1724afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1725afb2bd1cSJunchao Zhang #endif 1726afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1727aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1728aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1729aa372e3fSPaul Mullowney 
upTriFactor->csrMat->column_indices->data().get(), 1730aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1731d49cd2b7SBarry Smith tempGPU->data().get(), 17321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1733d49cd2b7SBarry Smith xarray, 1734d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1735d49cd2b7SBarry Smith #else 1736d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1737afb2bd1cSJunchao Zhang #endif 17389ae82921SPaul Mullowney 1739c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1740c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1741661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1742958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17439ae82921SPaul Mullowney PetscFunctionReturn(0); 17449ae82921SPaul Mullowney } 17459ae82921SPaul Mullowney 17467e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17477e8381f9SStefano Zampini { 17487e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17497e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17507e8381f9SStefano Zampini cudaError_t cerr; 17517e8381f9SStefano Zampini PetscErrorCode ierr; 17527e8381f9SStefano Zampini 17537e8381f9SStefano Zampini PetscFunctionBegin; 17547e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17557e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17567e8381f9SStefano Zampini 17577e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17587e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 17597e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 17607e8381f9SStefano Zampini ierr = 
PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 17617e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17627e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17637e8381f9SStefano Zampini } 17647e8381f9SStefano Zampini PetscFunctionReturn(0); 17657e8381f9SStefano Zampini } 17667e8381f9SStefano Zampini 17677e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17687e8381f9SStefano Zampini { 17697e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17707e8381f9SStefano Zampini PetscErrorCode ierr; 17717e8381f9SStefano Zampini 17727e8381f9SStefano Zampini PetscFunctionBegin; 17737e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 17747e8381f9SStefano Zampini *array = a->a; 17757e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 17767e8381f9SStefano Zampini PetscFunctionReturn(0); 17777e8381f9SStefano Zampini } 17787e8381f9SStefano Zampini 1779042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 17809ae82921SPaul Mullowney { 1781aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 17827c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 17839ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1784213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 17859ae82921SPaul Mullowney PetscErrorCode ierr; 1786aa372e3fSPaul Mullowney cusparseStatus_t stat; 1787abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1788b06137fdSPaul Mullowney cudaError_t err; 17899ae82921SPaul Mullowney 17909ae82921SPaul Mullowney PetscFunctionBegin; 1791e8d2b73aSMark Adams if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1792c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1793a49f1ed0SStefano Zampini if (A->nonzerostate == 
cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1794a49f1ed0SStefano Zampini CsrMatrix *matrix; 1795afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 179685ba7357SStefano Zampini 1797e8d2b73aSMark Adams if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 179885ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1799afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 180005035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 18014863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 180285ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1803a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 180434d6c7a5SJose E. Roman } else { 1805abb89eb1SStefano Zampini PetscInt nnz; 180685ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 18077c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1808a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 18097c700b8dSJunchao Zhang delete cusparsestruct->workVector; 181081902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1811a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1812a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18139ae82921SPaul Mullowney try { 18149ae82921SPaul Mullowney if (a->compressedrow.use) { 18159ae82921SPaul Mullowney m = a->compressedrow.nrows; 18169ae82921SPaul Mullowney ii = a->compressedrow.i; 18179ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 18189ae82921SPaul Mullowney } else { 1819213423ffSJunchao Zhang m = A->rmap->n; 1820213423ffSJunchao Zhang ii = a->i; 1821e6e9a74fSStefano Zampini ridx = NULL; 18229ae82921SPaul 
Mullowney } 1823e8d2b73aSMark Adams if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1824e8d2b73aSMark Adams if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1825abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1826abb89eb1SStefano Zampini else nnz = a->nz; 18279ae82921SPaul Mullowney 182885ba7357SStefano Zampini /* create cusparse matrix */ 1829abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1830aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 183157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 183257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 183357d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 18349ae82921SPaul Mullowney 1835afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 18367656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 18377656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1838afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 18397656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 18407656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 184157d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1842b06137fdSPaul Mullowney 1843aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 
1844aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1845aa372e3fSPaul Mullowney /* set the matrix */ 1846afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1847afb2bd1cSJunchao Zhang mat->num_rows = m; 1848afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1849abb89eb1SStefano Zampini mat->num_entries = nnz; 1850afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1851afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18529ae82921SPaul Mullowney 1853abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1854abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1855aa372e3fSPaul Mullowney 1856abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1857abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1858aa372e3fSPaul Mullowney 1859aa372e3fSPaul Mullowney /* assign the pointer */ 1860afb2bd1cSJunchao Zhang matstruct->mat = mat; 1861afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1862afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1863afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1864afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1865afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1866afb2bd1cSJunchao Zhang mat->values->data().get(), 1867afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1868afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1869afb2bd1cSJunchao Zhang } 1870afb2bd1cSJunchao Zhang #endif 1871aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1872afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1873afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1874afb2bd1cSJunchao Zhang #else 1875afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1876afb2bd1cSJunchao Zhang mat->num_rows = m; 1877afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1878abb89eb1SStefano Zampini mat->num_entries = nnz; 1879afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1880afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1881aa372e3fSPaul Mullowney 1882abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1883abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1884aa372e3fSPaul Mullowney 1885abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1886abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1887aa372e3fSPaul Mullowney 1888aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 188957d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1890aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1891aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1892afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1893afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1894afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1895afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 189657d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1897aa372e3fSPaul Mullowney /* assign the pointer */ 1898aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1899aa372e3fSPaul Mullowney 1900afb2bd1cSJunchao Zhang if (mat) { 1901afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1902afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1903afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1904afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1905087f3262SPaul Mullowney } 1906afb2bd1cSJunchao Zhang #endif 1907087f3262SPaul Mullowney } 1908ca45077fSPaul Mullowney 1909aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1910213423ffSJunchao Zhang if (a->compressedrow.use) { 1911213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1912aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1913aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1914213423ffSJunchao Zhang tmp = m; 1915213423ffSJunchao Zhang } else { 1916213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1917213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1918213423ffSJunchao Zhang tmp = 0; 1919213423ffSJunchao Zhang } 1920213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1921aa372e3fSPaul Mullowney 1922aa372e3fSPaul Mullowney /* assign the pointer */ 1923aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 19249ae82921SPaul Mullowney } 
catch(char *ex) { 19259ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 19269ae82921SPaul Mullowney } 192705035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 192885ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 192934d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 193034d6c7a5SJose E. Roman } 1931abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 19329ae82921SPaul Mullowney } 19339ae82921SPaul Mullowney PetscFunctionReturn(0); 19349ae82921SPaul Mullowney } 19359ae82921SPaul Mullowney 1936c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1937aa372e3fSPaul Mullowney { 1938aa372e3fSPaul Mullowney template <typename Tuple> 1939aa372e3fSPaul Mullowney __host__ __device__ 1940aa372e3fSPaul Mullowney void operator()(Tuple t) 1941aa372e3fSPaul Mullowney { 1942aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1943aa372e3fSPaul Mullowney } 1944aa372e3fSPaul Mullowney }; 1945aa372e3fSPaul Mullowney 19467e8381f9SStefano Zampini struct VecCUDAEquals 19477e8381f9SStefano Zampini { 19487e8381f9SStefano Zampini template <typename Tuple> 19497e8381f9SStefano Zampini __host__ __device__ 19507e8381f9SStefano Zampini void operator()(Tuple t) 19517e8381f9SStefano Zampini { 19527e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19537e8381f9SStefano Zampini } 19547e8381f9SStefano Zampini }; 19557e8381f9SStefano Zampini 1956e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1957e6e9a74fSStefano Zampini { 1958e6e9a74fSStefano Zampini template <typename Tuple> 1959e6e9a74fSStefano Zampini __host__ __device__ 1960e6e9a74fSStefano Zampini void operator()(Tuple t) 1961e6e9a74fSStefano Zampini { 1962e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1963e6e9a74fSStefano Zampini } 1964e6e9a74fSStefano Zampini }; 1965e6e9a74fSStefano Zampini 1966afb2bd1cSJunchao Zhang struct MatMatCusparse { 
1967ccdfe979SStefano Zampini PetscBool cisdense; 1968ccdfe979SStefano Zampini PetscScalar *Bt; 1969ccdfe979SStefano Zampini Mat X; 1970fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1971fcdce8c4SStefano Zampini PetscLogDouble flops; 1972fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 1973b4285af6SJunchao Zhang 1974afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1975fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 1976afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1977afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1978afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1979afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1980b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1981b4285af6SJunchao Zhang void *dBuffer4; 1982b4285af6SJunchao Zhang void *dBuffer5; 1983b4285af6SJunchao Zhang #endif 1984fcdce8c4SStefano Zampini size_t mmBufferSize; 1985fcdce8c4SStefano Zampini void *mmBuffer; 1986fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1987fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 1988afb2bd1cSJunchao Zhang #endif 1989afb2bd1cSJunchao Zhang }; 1990ccdfe979SStefano Zampini 1991ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1992ccdfe979SStefano Zampini { 1993ccdfe979SStefano Zampini PetscErrorCode ierr; 1994ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1995ccdfe979SStefano Zampini cudaError_t cerr; 1996fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1997fcdce8c4SStefano Zampini cusparseStatus_t stat; 1998fcdce8c4SStefano Zampini #endif 1999ccdfe979SStefano Zampini 2000ccdfe979SStefano Zampini PetscFunctionBegin; 2001ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2002fcdce8c4SStefano Zampini delete 
mmdata->Bcsr; 2003afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2004fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 2005afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2006afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2007fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2008b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2009b4285af6SJunchao Zhang if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2010b4285af6SJunchao Zhang if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2011b4285af6SJunchao Zhang #endif 2012b4285af6SJunchao Zhang if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2013b4285af6SJunchao Zhang if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2014afb2bd1cSJunchao Zhang #endif 2015ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2016ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 2017ccdfe979SStefano Zampini PetscFunctionReturn(0); 2018ccdfe979SStefano Zampini } 2019ccdfe979SStefano Zampini 2020ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2021ccdfe979SStefano Zampini 2022ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2023ccdfe979SStefano Zampini { 2024ccdfe979SStefano Zampini Mat_Product *product = C->product; 2025ccdfe979SStefano Zampini Mat A,B; 2026afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2027ccdfe979SStefano Zampini PetscBool flg,biscuda; 2028ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2029ccdfe979SStefano Zampini 
cusparseStatus_t stat; 2030ccdfe979SStefano Zampini cusparseOperation_t opA; 2031ccdfe979SStefano Zampini const PetscScalar *barray; 2032ccdfe979SStefano Zampini PetscScalar *carray; 2033ccdfe979SStefano Zampini PetscErrorCode ierr; 2034ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2035ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2036ccdfe979SStefano Zampini CsrMatrix *csrmat; 2037ccdfe979SStefano Zampini 2038ccdfe979SStefano Zampini PetscFunctionBegin; 2039ccdfe979SStefano Zampini MatCheckProduct(C,1); 2040e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2041ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2042ccdfe979SStefano Zampini A = product->A; 2043ccdfe979SStefano Zampini B = product->B; 2044ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2045e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2046ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2047ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 2048ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2049ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2050ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2051ccdfe979SStefano Zampini switch (product->type) { 2052ccdfe979SStefano Zampini case MATPRODUCT_AB: 2053ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2054ccdfe979SStefano Zampini mat = cusp->mat; 2055ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2056ccdfe979SStefano Zampini m = A->rmap->n; 2057ccdfe979SStefano Zampini n = B->cmap->n; 2058ccdfe979SStefano 
Zampini break; 2059ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20601a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2061e6e9a74fSStefano Zampini mat = cusp->mat; 2062e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2063e6e9a74fSStefano Zampini } else { 20643606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2065ccdfe979SStefano Zampini mat = cusp->matTranspose; 2066ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2067e6e9a74fSStefano Zampini } 2068ccdfe979SStefano Zampini m = A->cmap->n; 2069ccdfe979SStefano Zampini n = B->cmap->n; 2070ccdfe979SStefano Zampini break; 2071ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2072ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2073ccdfe979SStefano Zampini mat = cusp->mat; 2074ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2075ccdfe979SStefano Zampini m = A->rmap->n; 2076ccdfe979SStefano Zampini n = B->rmap->n; 2077ccdfe979SStefano Zampini break; 2078ccdfe979SStefano Zampini default: 2079e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2080ccdfe979SStefano Zampini } 2081e8d2b73aSMark Adams if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2082ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2083ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2084ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2085afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2086ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2087afb2bd1cSJunchao Zhang 2088ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2089c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == 
MATPRODUCT_PtAP) { 2090c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2091c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2092c8378d12SStefano Zampini } else { 2093c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2094c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2095c8378d12SStefano Zampini } 2096c8378d12SStefano Zampini 2097c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2098afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2099afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2100a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2101afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2102fcdce8c4SStefano Zampini size_t mmBufferSize; 2103afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2104afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2105afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2106afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2107afb2bd1cSJunchao Zhang } 2108c8378d12SStefano Zampini 2109afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2110afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2111afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2112afb2bd1cSJunchao Zhang 
mmdata->Clda = clda; 2113afb2bd1cSJunchao Zhang } 2114afb2bd1cSJunchao Zhang 2115afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2116afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2117afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2118afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2119afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2120afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2121afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2122afb2bd1cSJunchao Zhang } 2123afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2124afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2125afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2126fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2127fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2128ee7b52eaSHong Zhang cudaError_t cerr; 2129fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2130fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2131fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2132fcdce8c4SStefano Zampini } 2133afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2134afb2bd1cSJunchao Zhang } else { 2135afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2136afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2137afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2138afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2139afb2bd1cSJunchao Zhang } 
2140afb2bd1cSJunchao Zhang 2141afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2142afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2143afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2144afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2145fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2146afb2bd1cSJunchao Zhang #else 2147afb2bd1cSJunchao Zhang PetscInt k; 2148afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2149ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2150ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2151ccdfe979SStefano Zampini cublasStatus_t cerr; 2152ccdfe979SStefano Zampini 2153ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2154ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2155ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2156ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2157ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2158ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2159ccdfe979SStefano Zampini blda = B->cmap->n; 2160afb2bd1cSJunchao Zhang k = B->cmap->n; 2161afb2bd1cSJunchao Zhang } else { 2162afb2bd1cSJunchao Zhang k = B->rmap->n; 2163ccdfe979SStefano Zampini } 2164ccdfe979SStefano Zampini 2165afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2166ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2167afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2168ccdfe979SStefano Zampini csrmat->values->data().get(), 2169ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2170ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2171ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2172ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2173afb2bd1cSJunchao Zhang #endif 2174c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2175c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2176ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2177ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2178ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2179ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2180ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2181ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2182ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2183ccdfe979SStefano Zampini } else { 2184ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2185ccdfe979SStefano Zampini } 2186ccdfe979SStefano Zampini if (mmdata->cisdense) { 2187ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2188ccdfe979SStefano Zampini } 2189ccdfe979SStefano Zampini if (!biscuda) { 2190ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2191ccdfe979SStefano Zampini } 2192ccdfe979SStefano Zampini PetscFunctionReturn(0); 2193ccdfe979SStefano Zampini } 2194ccdfe979SStefano Zampini 2195ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2196ccdfe979SStefano Zampini { 2197ccdfe979SStefano Zampini Mat_Product *product = C->product; 2198ccdfe979SStefano Zampini Mat A,B; 2199ccdfe979SStefano Zampini PetscInt m,n; 2200ccdfe979SStefano Zampini PetscBool cisdense,flg; 
2201ccdfe979SStefano Zampini PetscErrorCode ierr; 2202ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2203ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2204ccdfe979SStefano Zampini 2205ccdfe979SStefano Zampini PetscFunctionBegin; 2206ccdfe979SStefano Zampini MatCheckProduct(C,1); 2207e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2208ccdfe979SStefano Zampini A = product->A; 2209ccdfe979SStefano Zampini B = product->B; 2210ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2211e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2212ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2213e8d2b73aSMark Adams if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2214ccdfe979SStefano Zampini switch (product->type) { 2215ccdfe979SStefano Zampini case MATPRODUCT_AB: 2216ccdfe979SStefano Zampini m = A->rmap->n; 2217ccdfe979SStefano Zampini n = B->cmap->n; 2218ccdfe979SStefano Zampini break; 2219ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2220ccdfe979SStefano Zampini m = A->cmap->n; 2221ccdfe979SStefano Zampini n = B->cmap->n; 2222ccdfe979SStefano Zampini break; 2223ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2224ccdfe979SStefano Zampini m = A->rmap->n; 2225ccdfe979SStefano Zampini n = B->rmap->n; 2226ccdfe979SStefano Zampini break; 2227ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2228ccdfe979SStefano Zampini m = B->cmap->n; 2229ccdfe979SStefano Zampini n = B->cmap->n; 2230ccdfe979SStefano Zampini break; 2231ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2232ccdfe979SStefano Zampini m = B->rmap->n; 2233ccdfe979SStefano Zampini n = B->rmap->n; 2234ccdfe979SStefano Zampini break; 2235ccdfe979SStefano Zampini default: 2236e8d2b73aSMark Adams 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2237ccdfe979SStefano Zampini } 2238ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2239ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2240ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2241ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2242ccdfe979SStefano Zampini 2243ccdfe979SStefano Zampini /* product data */ 2244ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2245ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2246afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2247afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2248ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2249afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2250ccdfe979SStefano Zampini } 2251afb2bd1cSJunchao Zhang #endif 2252ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2253ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2254ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2255ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2256ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2257ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2258ccdfe979SStefano Zampini } else { 2259ccdfe979SStefano Zampini ierr 
= MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2260ccdfe979SStefano Zampini } 2261ccdfe979SStefano Zampini } 2262ccdfe979SStefano Zampini C->product->data = mmdata; 2263ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2264ccdfe979SStefano Zampini 2265ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2266ccdfe979SStefano Zampini PetscFunctionReturn(0); 2267ccdfe979SStefano Zampini } 2268ccdfe979SStefano Zampini 2269fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2270ccdfe979SStefano Zampini { 2271ccdfe979SStefano Zampini Mat_Product *product = C->product; 2272fcdce8c4SStefano Zampini Mat A,B; 2273fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2274fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2275fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2276fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2277fcdce8c4SStefano Zampini PetscBool flg; 2278ccdfe979SStefano Zampini PetscErrorCode ierr; 2279fcdce8c4SStefano Zampini cusparseStatus_t stat; 2280fcdce8c4SStefano Zampini cudaError_t cerr; 2281fcdce8c4SStefano Zampini MatProductType ptype; 2282fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2283fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2284fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2285fcdce8c4SStefano Zampini #endif 2286b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2287ccdfe979SStefano Zampini 2288ccdfe979SStefano Zampini PetscFunctionBegin; 2289ccdfe979SStefano Zampini MatCheckProduct(C,1); 2290e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2291fcdce8c4SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2292e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2293fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2294fcdce8c4SStefano Zampini A = product->A; 2295fcdce8c4SStefano Zampini B = product->B; 2296fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2297fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2298fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2299e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2300fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2301e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2302fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2303e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2304fcdce8c4SStefano Zampini goto finalize; 2305fcdce8c4SStefano Zampini } 2306fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2307fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2308e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2309fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2310e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2311fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind 
to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2312fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2313fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2314fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2315fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2316e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2317e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2318e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2319fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2320fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2321fcdce8c4SStefano Zampini 2322fcdce8c4SStefano Zampini ptype = product->type; 2323fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2324fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2325fa046f9fSJunchao Zhang if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2326fa046f9fSJunchao Zhang } 2327fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2328fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2329fa046f9fSJunchao Zhang if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2330fa046f9fSJunchao Zhang } 2331fcdce8c4SStefano Zampini switch (ptype) { 2332fcdce8c4SStefano Zampini case 
MATPRODUCT_AB: 2333fcdce8c4SStefano Zampini Amat = Acusp->mat; 2334fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2335fcdce8c4SStefano Zampini break; 2336fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2337fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2338fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2339fcdce8c4SStefano Zampini break; 2340fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2341fcdce8c4SStefano Zampini Amat = Acusp->mat; 2342fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2343fcdce8c4SStefano Zampini break; 2344fcdce8c4SStefano Zampini default: 2345e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2346fcdce8c4SStefano Zampini } 2347fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2348e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2349e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2350e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2351fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2352fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2353fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2354e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2355e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2356e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2357fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2358fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2359fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2360b4285af6SJunchao Zhang stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2361b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2362b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2363b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2364b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2365b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2366b4285af6SJunchao Zhang #else 2367b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2368fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2369fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2370fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2371b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2372fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2373fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 
2374b4285af6SJunchao Zhang #endif 2375fcdce8c4SStefano Zampini #else 2376b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2377fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2378fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2379fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2380fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2381fcdce8c4SStefano Zampini #endif 2382fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2383fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2384fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2385fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2386fcdce8c4SStefano Zampini finalize: 2387fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 2388fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2389fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2390fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2391fcdce8c4SStefano Zampini c->reallocs = 0; 2392fcdce8c4SStefano Zampini C->info.mallocs += 0; 2393fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2394fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2395fcdce8c4SStefano Zampini C->num_ass++; 2396ccdfe979SStefano Zampini PetscFunctionReturn(0); 2397ccdfe979SStefano Zampini } 2398fcdce8c4SStefano Zampini 2399fcdce8c4SStefano Zampini static PetscErrorCode 
MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2400fcdce8c4SStefano Zampini { 2401fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2402fcdce8c4SStefano Zampini Mat A,B; 2403fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2404fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2405fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2406fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2407fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2408fcdce8c4SStefano Zampini PetscBool flg; 2409fcdce8c4SStefano Zampini PetscErrorCode ierr; 2410fcdce8c4SStefano Zampini cusparseStatus_t stat; 2411fcdce8c4SStefano Zampini cudaError_t cerr; 2412fcdce8c4SStefano Zampini MatProductType ptype; 2413fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2414fcdce8c4SStefano Zampini PetscLogDouble flops; 2415fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2416fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2417fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2418fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2419fcdce8c4SStefano Zampini #else 2420fcdce8c4SStefano Zampini int cnz; 2421fcdce8c4SStefano Zampini #endif 2422b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2423fcdce8c4SStefano Zampini 2424fcdce8c4SStefano Zampini PetscFunctionBegin; 2425fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2426e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2427fcdce8c4SStefano Zampini A = product->A; 2428fcdce8c4SStefano Zampini B = product->B; 2429fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2430e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 
2431fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2432e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2433fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2434fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2435fcdce8c4SStefano Zampini /* product data */ 2436fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2437fcdce8c4SStefano Zampini C->product->data = mmdata; 2438fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2439fcdce8c4SStefano Zampini 2440fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2441fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2442d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2443d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2444d60bce21SJunchao Zhang if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2445d60bce21SJunchao Zhang if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2446d60bce21SJunchao Zhang 2447fcdce8c4SStefano Zampini ptype = product->type; 2448fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2449fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2450fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2451fa046f9fSJunchao Zhang } 2452fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2453fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2454fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2455fa046f9fSJunchao Zhang } 2456fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2457fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 
2458fcdce8c4SStefano Zampini switch (ptype) { 2459fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2460fcdce8c4SStefano Zampini m = A->rmap->n; 2461fcdce8c4SStefano Zampini n = B->cmap->n; 2462fcdce8c4SStefano Zampini k = A->cmap->n; 2463fcdce8c4SStefano Zampini Amat = Acusp->mat; 2464fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2465fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2466fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2467fcdce8c4SStefano Zampini break; 2468fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2469fcdce8c4SStefano Zampini m = A->cmap->n; 2470fcdce8c4SStefano Zampini n = B->cmap->n; 2471fcdce8c4SStefano Zampini k = A->rmap->n; 24723606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2473fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2474fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2475fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2476fcdce8c4SStefano Zampini break; 2477fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2478fcdce8c4SStefano Zampini m = A->rmap->n; 2479fcdce8c4SStefano Zampini n = B->rmap->n; 2480fcdce8c4SStefano Zampini k = A->cmap->n; 24813606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2482fcdce8c4SStefano Zampini Amat = Acusp->mat; 2483fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2484fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2485fcdce8c4SStefano Zampini break; 2486fcdce8c4SStefano Zampini default: 2487e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2488fcdce8c4SStefano Zampini } 2489fcdce8c4SStefano Zampini 2490fcdce8c4SStefano Zampini /* create cusparse matrix */ 2491fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2492fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 
2493fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2494fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2495fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2496fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2497fcdce8c4SStefano Zampini 2498fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2499fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2500fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2501fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2502fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2503fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2504fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2505fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2506fcdce8c4SStefano Zampini } else { 2507fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2508fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2509fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2510fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2511fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2512fcdce8c4SStefano Zampini } 2513fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2514fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2515fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2516fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2517fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2518fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2519fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2520fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2521fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2522fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2523fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2524fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2525fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2526fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2527fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2528fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2529fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2530fcdce8c4SStefano Zampini c->nz = 0; 2531fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2532fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2533fcdce8c4SStefano Zampini goto finalizesym; 2534fcdce8c4SStefano Zampini } 2535fcdce8c4SStefano Zampini 2536e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2537e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2538fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2539fcdce8c4SStefano Zampini if (!biscompressed) { 2540fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2541fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2542fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2543fcdce8c4SStefano Zampini #endif 2544fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2545fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2546fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2547fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2548fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2549fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2550fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2551fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2552fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2553fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2554fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2555fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2556fcdce8c4SStefano Zampini } 2557fcdce8c4SStefano Zampini 
Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2558fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2559fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2560fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2561fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2562fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2563fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2564fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2565fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2566fcdce8c4SStefano Zampini } 2567fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2568fcdce8c4SStefano Zampini #endif 2569fcdce8c4SStefano Zampini } 2570e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2571e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2572fcdce8c4SStefano Zampini /* precompute flops count */ 2573fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2574fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2575fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2576fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2577fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2578fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2579fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2580fcdce8c4SStefano Zampini } 2581fcdce8c4SStefano Zampini } 2582fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2583fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2584fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2585fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2586fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 2587fcdce8c4SStefano Zampini } 
2588fcdce8c4SStefano Zampini } else { /* TODO */ 2589fcdce8c4SStefano Zampini flops = 0.; 2590fcdce8c4SStefano Zampini } 2591fcdce8c4SStefano Zampini 2592fcdce8c4SStefano Zampini mmdata->flops = flops; 2593fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2594b4285af6SJunchao Zhang 2595fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2596fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2597fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2598fcdce8c4SStefano Zampini NULL, NULL, NULL, 2599fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2600fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2601fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2602b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2603b4285af6SJunchao Zhang { 2604b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2605b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2606b4285af6SJunchao Zhang */ 2607b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2608b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2609b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2610b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2611b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2612b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2613b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2614b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2615b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2616b4285af6SJunchao Zhang 2617b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2618b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2619b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2620b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2621b4285af6SJunchao Zhang &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2622b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2623b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2624b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2625b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2626b4285af6SJunchao Zhang &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2627b4285af6SJunchao Zhang 2628b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2629b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2630b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, 2631b4285af6SJunchao Zhang &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2632b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2633b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2634b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2635b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2636b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2637b4285af6SJunchao Zhang &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2638b4285af6SJunchao Zhang cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2639b4285af6SJunchao Zhang cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2640b4285af6SJunchao Zhang 2641b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2642b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 2643b4285af6SJunchao Zhang stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2644b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2645b4285af6SJunchao Zhang /* allocate matrix C */ 2646b4285af6SJunchao Zhang Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2647b4285af6SJunchao Zhang Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2648b4285af6SJunchao Zhang /* update matC with the new pointers */ 2649b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2650b4285af6SJunchao Zhang Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2651b4285af6SJunchao Zhang 2652b4285af6SJunchao Zhang 
/*----------------------------------------------------------------------*/ 2653b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2654b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2655b4285af6SJunchao Zhang &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2656b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2657b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2658b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2659b4285af6SJunchao Zhang &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2660b4285af6SJunchao Zhang cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2661b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2662b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2663b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2664b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2665b4285af6SJunchao Zhang ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2666b4285af6SJunchao Zhang } 2667b4285af6SJunchao Zhang #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2668b4285af6SJunchao Zhang size_t bufSize2; 2669fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2670b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2671fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2672fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2673fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2674bfcc3627SStefano Zampini cerr = 
cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2675fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2676b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2677fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2678fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2679fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2680fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2681b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2682fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2684fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2685fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2686fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2687fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2688fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2689fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2690bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2691fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2692b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2693fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2694fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2695fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2696fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2697fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2698fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 269900702c57SStefano Zampini ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2700fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2701fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2702fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2703fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2704fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2705fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2706b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2707fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2708fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2709b4285af6SJunchao Zhang #endif 
2710fcdce8c4SStefano Zampini #else 2711fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2712b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2713fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2714fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2715fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2716fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2717fcdce8c4SStefano Zampini c->nz = cnz; 2718fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2719fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2720fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2721fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2722fcdce8c4SStefano Zampini 2723fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2724fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2725fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2726fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2727b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2728fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2729fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2730fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2731fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2732fcdce8c4SStefano Zampini #endif 2733fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2734fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2735fcdce8c4SStefano Zampini finalizesym: 2736fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2737fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2738fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2739fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2740fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2741fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2742fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2743fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2744fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2745fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2746fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2747fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2748fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2749fcdce8c4SStefano Zampini cerr = 
cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2750fcdce8c4SStefano Zampini } else { 2751fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2752fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2753fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2754fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2755fcdce8c4SStefano Zampini } 2756fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2757fcdce8c4SStefano Zampini PetscInt r = 0; 2758fcdce8c4SStefano Zampini c->i[0] = 0; 2759fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2760fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2761fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2762fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2763fcdce8c4SStefano Zampini } 2764fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2765fcdce8c4SStefano Zampini } 2766fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2767fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2768fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2769fcdce8c4SStefano Zampini c->maxnz = c->nz; 2770fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2771fcdce8c4SStefano Zampini c->rmax = 0; 2772fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2773fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2774fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2775fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 
2776fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2777fcdce8c4SStefano Zampini } 2778fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2779fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2780fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2781fcdce8c4SStefano Zampini 2782fcdce8c4SStefano Zampini C->nonzerostate++; 2783fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2784fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2785fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2786fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2787fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2788fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2789fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2790abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2791fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2792fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2793fcdce8c4SStefano Zampini } 2794fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2795fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2796fcdce8c4SStefano Zampini } 2797fcdce8c4SStefano Zampini 2798fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2799fcdce8c4SStefano Zampini 2800fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2801fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2802fcdce8c4SStefano Zampini { 2803fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2804fcdce8c4SStefano Zampini PetscErrorCode ierr; 2805fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 
2806fcdce8c4SStefano Zampini 2807fcdce8c4SStefano Zampini PetscFunctionBegin; 2808fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2809fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2810abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 2811fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2812fcdce8c4SStefano Zampini } 2813fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2814fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2815fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2816fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2817fcdce8c4SStefano Zampini } 2818fcdce8c4SStefano Zampini } 281965e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 282065e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 282165e4b4d4SStefano Zampini switch (product->type) { 282265e4b4d4SStefano Zampini case MATPRODUCT_AB: 282365e4b4d4SStefano Zampini if (product->api_user) { 282465e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 282565e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 282665e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 282765e4b4d4SStefano Zampini } else { 282865e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 282965e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 283065e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 283165e4b4d4SStefano Zampini } 
283265e4b4d4SStefano Zampini break; 283365e4b4d4SStefano Zampini case MATPRODUCT_AtB: 283465e4b4d4SStefano Zampini if (product->api_user) { 283565e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 283665e4b4d4SStefano Zampini ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 283765e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 283865e4b4d4SStefano Zampini } else { 283965e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 284065e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 284165e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 284265e4b4d4SStefano Zampini } 284365e4b4d4SStefano Zampini break; 284465e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 284565e4b4d4SStefano Zampini if (product->api_user) { 284665e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 284765e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 284865e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 284965e4b4d4SStefano Zampini } else { 285065e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 285165e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 285265e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 285365e4b4d4SStefano Zampini } 285465e4b4d4SStefano Zampini break; 285565e4b4d4SStefano Zampini case MATPRODUCT_RARt: 
285665e4b4d4SStefano Zampini if (product->api_user) { 285765e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 285865e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 285965e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 286065e4b4d4SStefano Zampini } else { 286165e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 286265e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 286365e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 286465e4b4d4SStefano Zampini } 286565e4b4d4SStefano Zampini break; 286665e4b4d4SStefano Zampini case MATPRODUCT_ABC: 286765e4b4d4SStefano Zampini if (product->api_user) { 286865e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 286965e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 287065e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 287165e4b4d4SStefano Zampini } else { 287265e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 287365e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 287465e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 287565e4b4d4SStefano Zampini } 287665e4b4d4SStefano Zampini break; 287765e4b4d4SStefano Zampini default: 287865e4b4d4SStefano Zampini break; 287965e4b4d4SStefano Zampini } 288065e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = 
PETSC_FALSE; 288165e4b4d4SStefano Zampini } 288265e4b4d4SStefano Zampini /* dispatch */ 2883fcdce8c4SStefano Zampini if (isdense) { 2884ccdfe979SStefano Zampini switch (product->type) { 2885ccdfe979SStefano Zampini case MATPRODUCT_AB: 2886ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2887ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2888ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2889ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2890fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2891fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2892fcdce8c4SStefano Zampini } else { 2893fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2894fcdce8c4SStefano Zampini } 2895fcdce8c4SStefano Zampini break; 2896fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2897fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2898fcdce8c4SStefano Zampini break; 2899ccdfe979SStefano Zampini default: 2900ccdfe979SStefano Zampini break; 2901ccdfe979SStefano Zampini } 2902fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2903fcdce8c4SStefano Zampini switch (product->type) { 2904fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2905fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2906fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2907fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2908fcdce8c4SStefano Zampini break; 2909fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2910fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2911fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2912fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2913fcdce8c4SStefano Zampini break; 2914fcdce8c4SStefano Zampini default: 2915fcdce8c4SStefano Zampini break; 2916fcdce8c4SStefano Zampini } 2917fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 2918fcdce8c4SStefano Zampini ierr = 
MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2919fcdce8c4SStefano Zampini } 2920ccdfe979SStefano Zampini PetscFunctionReturn(0); 2921ccdfe979SStefano Zampini } 2922ccdfe979SStefano Zampini 29236fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 29249ae82921SPaul Mullowney { 2925b175d8bbSPaul Mullowney PetscErrorCode ierr; 29269ae82921SPaul Mullowney 29279ae82921SPaul Mullowney PetscFunctionBegin; 2928e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2929e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2930e6e9a74fSStefano Zampini } 2931e6e9a74fSStefano Zampini 2932e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2933e6e9a74fSStefano Zampini { 2934e6e9a74fSStefano Zampini PetscErrorCode ierr; 2935e6e9a74fSStefano Zampini 2936e6e9a74fSStefano Zampini PetscFunctionBegin; 2937e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2938e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2939e6e9a74fSStefano Zampini } 2940e6e9a74fSStefano Zampini 2941e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2942e6e9a74fSStefano Zampini { 2943e6e9a74fSStefano Zampini PetscErrorCode ierr; 2944e6e9a74fSStefano Zampini 2945e6e9a74fSStefano Zampini PetscFunctionBegin; 2946e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2947e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2948e6e9a74fSStefano Zampini } 2949e6e9a74fSStefano Zampini 2950e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2951e6e9a74fSStefano Zampini { 2952e6e9a74fSStefano Zampini PetscErrorCode ierr; 2953e6e9a74fSStefano Zampini 2954e6e9a74fSStefano Zampini PetscFunctionBegin; 2955e6e9a74fSStefano Zampini ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 29569ae82921SPaul Mullowney PetscFunctionReturn(0); 29579ae82921SPaul Mullowney } 29589ae82921SPaul Mullowney 29596fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2960ca45077fSPaul Mullowney { 2961b175d8bbSPaul Mullowney PetscErrorCode ierr; 2962ca45077fSPaul Mullowney 2963ca45077fSPaul Mullowney PetscFunctionBegin; 2964e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2965ca45077fSPaul Mullowney PetscFunctionReturn(0); 2966ca45077fSPaul Mullowney } 2967ca45077fSPaul Mullowney 2968a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2969a0e72f99SJunchao Zhang { 2970a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2971a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2972a0e72f99SJunchao Zhang } 2973a0e72f99SJunchao Zhang 2974afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2975e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 29769ae82921SPaul Mullowney { 29779ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2978aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 29799ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2980e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2981b175d8bbSPaul Mullowney PetscErrorCode ierr; 2982aa372e3fSPaul Mullowney cusparseStatus_t stat; 2983e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2984e6e9a74fSStefano Zampini PetscBool compressed; 2985afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2986afb2bd1cSJunchao Zhang PetscInt nx,ny; 2987afb2bd1cSJunchao Zhang #endif 29886e111a19SKarl Rupp 29899ae82921SPaul Mullowney PetscFunctionBegin; 2990e8d2b73aSMark Adams if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2991e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2992afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2993d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2994e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2995e6e9a74fSStefano Zampini } 299634d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 299734d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2998e6e9a74fSStefano Zampini if (!trans) { 29999ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3000e8d2b73aSMark Adams if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3001e6e9a74fSStefano Zampini } else { 30021a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3003e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3004e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3005e6e9a74fSStefano Zampini } else { 30063606e59fSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3007e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3008e6e9a74fSStefano Zampini } 3009e6e9a74fSStefano Zampini } 3010e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3011e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3012213423ffSJunchao Zhang 3013e6e9a74fSStefano Zampini try { 3014e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3015213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3016213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3017afb2bd1cSJunchao Zhang 301885ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3019e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3020afb2bd1cSJunchao Zhang /* z = A x + beta y. 3021afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
3022afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3023afb2bd1cSJunchao Zhang */ 3024e6e9a74fSStefano Zampini xptr = xarray; 3025afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3026213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3027afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3028afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3029afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3030afb2bd1cSJunchao Zhang */ 3031afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3032afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3033afb2bd1cSJunchao Zhang nx = mat->num_cols; 3034afb2bd1cSJunchao Zhang ny = mat->num_rows; 3035afb2bd1cSJunchao Zhang } 3036afb2bd1cSJunchao Zhang #endif 3037e6e9a74fSStefano Zampini } else { 3038afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3039afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3040afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3041afb2bd1cSJunchao Zhang */ 3042afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3043e6e9a74fSStefano Zampini dptr = zarray; 3044e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3045afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3046e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3047a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3048e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3049e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3050e6e9a74fSStefano Zampini } 3051afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3052afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3053afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3054afb2bd1cSJunchao Zhang nx = mat->num_rows; 3055afb2bd1cSJunchao Zhang ny = mat->num_cols; 3056afb2bd1cSJunchao Zhang } 3057afb2bd1cSJunchao Zhang #endif 3058e6e9a74fSStefano Zampini } 30599ae82921SPaul Mullowney 3060afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3061aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3062afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3063afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3064afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3065ee7b52eaSHong Zhang cudaError_t cerr; 3066afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3067afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3068afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3069afb2bd1cSJunchao Zhang matstruct->matDescr, 3070afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3071afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3072afb2bd1cSJunchao Zhang cusparse_scalartype, 3073afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3074afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3075afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3076afb2bd1cSJunchao Zhang 3077afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3078afb2bd1cSJunchao Zhang } else { 3079afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3080afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3081afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3082afb2bd1cSJunchao Zhang } 3083afb2bd1cSJunchao Zhang 3084afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 3085afb2bd1cSJunchao Zhang matstruct->alpha_one, 30863606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3087afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3088afb2bd1cSJunchao Zhang beta, 3089afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3090afb2bd1cSJunchao Zhang cusparse_scalartype, 3091afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3092afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3093afb2bd1cSJunchao Zhang #else 30947656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 
3095e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3096a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3097afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3098aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3099e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 310057d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3101afb2bd1cSJunchao Zhang #endif 3102aa372e3fSPaul Mullowney } else { 3103213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3104afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3105afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3106afb2bd1cSJunchao Zhang #else 3107301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3108e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3109afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3110e6e9a74fSStefano Zampini xptr, beta, 311157d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3112afb2bd1cSJunchao Zhang #endif 3113a65300a6SPaul Mullowney } 3114aa372e3fSPaul Mullowney } 3115958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3116aa372e3fSPaul Mullowney 3117e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3118213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3119213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3120213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3121e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3122213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 31237656d835SStefano Zampini } 3124213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3125c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 31267656d835SStefano Zampini } 31277656d835SStefano Zampini 3128213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3129213423ffSJunchao Zhang if (compressed) { 3130e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3131a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3132a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3133a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
3134a0e72f99SJunchao Zhang */ 3135a0e72f99SJunchao Zhang #if 0 3136a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3137a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3138a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3139e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3140c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3141a0e72f99SJunchao Zhang #else 3142a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3143a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3144a0e72f99SJunchao Zhang #endif 3145958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3146e6e9a74fSStefano Zampini } 3147e6e9a74fSStefano Zampini } else { 3148e6e9a74fSStefano Zampini if (yy && yy != zz) { 3149e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3150e6e9a74fSStefano Zampini } 3151e6e9a74fSStefano Zampini } 3152e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3153213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3154213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 31559ae82921SPaul Mullowney } catch(char *ex) { 31569ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 31579ae82921SPaul Mullowney } 3158e6e9a74fSStefano Zampini if (yy) { 3159958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3160e6e9a74fSStefano 
Zampini } else { 3161e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3162e6e9a74fSStefano Zampini } 31639ae82921SPaul Mullowney PetscFunctionReturn(0); 31649ae82921SPaul Mullowney } 31659ae82921SPaul Mullowney 31666fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3167ca45077fSPaul Mullowney { 3168b175d8bbSPaul Mullowney PetscErrorCode ierr; 31696e111a19SKarl Rupp 3170ca45077fSPaul Mullowney PetscFunctionBegin; 3171e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3172ca45077fSPaul Mullowney PetscFunctionReturn(0); 3173ca45077fSPaul Mullowney } 3174ca45077fSPaul Mullowney 31756fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 31769ae82921SPaul Mullowney { 31779ae82921SPaul Mullowney PetscErrorCode ierr; 3178042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3179042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 31803fa6b06aSMark Adams 3181042217e8SBarry Smith PetscFunctionBegin; 3182042217e8SBarry Smith ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3183042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3184042217e8SBarry Smith cudaError_t cerr; 3185042217e8SBarry Smith 3186042217e8SBarry Smith ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3187042217e8SBarry Smith cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3188042217e8SBarry Smith cusp->deviceMat = NULL; 3189042217e8SBarry Smith } 31909ae82921SPaul Mullowney PetscFunctionReturn(0); 31919ae82921SPaul Mullowney } 31929ae82921SPaul Mullowney 31939ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3194e057df02SPaul Mullowney /*@ 31959ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 
(the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.
That is, the stored row and column indices can begin at 32259ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 32269ae82921SPaul Mullowney 32279ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 32280298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 32299ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 32309ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 32319ae82921SPaul Mullowney 32329ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 32339ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 32349ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 32359ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 32369ae82921SPaul Mullowney 32379ae82921SPaul Mullowney Level: intermediate 32389ae82921SPaul Mullowney 3239e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 32409ae82921SPaul Mullowney @*/ 32419ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 32429ae82921SPaul Mullowney { 32439ae82921SPaul Mullowney PetscErrorCode ierr; 32449ae82921SPaul Mullowney 32459ae82921SPaul Mullowney PetscFunctionBegin; 32469ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 32479ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 32489ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 32499ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 32509ae82921SPaul Mullowney PetscFunctionReturn(0); 
32519ae82921SPaul Mullowney } 32529ae82921SPaul Mullowney 32536fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 32549ae82921SPaul Mullowney { 32559ae82921SPaul Mullowney PetscErrorCode ierr; 3256ab25e6cbSDominic Meiser 32579ae82921SPaul Mullowney PetscFunctionBegin; 32589ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 3259470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 32609ae82921SPaul Mullowney } else { 3261470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3262aa372e3fSPaul Mullowney } 3263c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3264ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3265ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3266ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3267fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3268ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 32697e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 32707e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3271ae48a8d0SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 32729ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 32739ae82921SPaul Mullowney 
PetscFunctionReturn(0); 32749ae82921SPaul Mullowney } 32759ae82921SPaul Mullowney 3276ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 327795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 32789ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 32799ff858a8SKarl Rupp { 32809ff858a8SKarl Rupp PetscErrorCode ierr; 32819ff858a8SKarl Rupp 32829ff858a8SKarl Rupp PetscFunctionBegin; 32839ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3284ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 32859ff858a8SKarl Rupp PetscFunctionReturn(0); 32869ff858a8SKarl Rupp } 32879ff858a8SKarl Rupp 3288039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 328995639643SRichard Tran Mills { 3290e6e9a74fSStefano Zampini PetscErrorCode ierr; 3291a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3292039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3293039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3294039c6fbaSStefano Zampini PetscScalar *ay; 3295039c6fbaSStefano Zampini const PetscScalar *ax; 3296039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3297e6e9a74fSStefano Zampini 329895639643SRichard Tran Mills PetscFunctionBegin; 3299a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3300a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3301039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3302a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3303a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3304a587d139SMark PetscFunctionReturn(0); 330595639643SRichard Tran Mills } 3306039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU 
*/ 3307a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3308a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3309e8d2b73aSMark Adams if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3310e8d2b73aSMark Adams if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3311039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3312039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3313039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3314039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3315039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3316039c6fbaSStefano Zampini if (eq) { 3317039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3318039c6fbaSStefano Zampini } 3319039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3320039c6fbaSStefano Zampini } 3321d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3322d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3323039c6fbaSStefano Zampini 3324039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3325039c6fbaSStefano Zampini cusparseStatus_t stat; 3326039c6fbaSStefano Zampini PetscScalar b = 1.0; 3327039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3328039c6fbaSStefano Zampini size_t bufferSize; 3329039c6fbaSStefano Zampini void *buffer; 3330ee7b52eaSHong Zhang cudaError_t cerr; 3331039c6fbaSStefano Zampini #endif 3332039c6fbaSStefano Zampini 3333039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3334039c6fbaSStefano 
Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3335039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3336039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3337039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3338039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3339039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3340039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3341039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3342039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3343039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3344039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3345039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3346039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3347039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3348039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3349039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3350039c6fbaSStefano Zampini #else 3351039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3352039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3353039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3354039c6fbaSStefano Zampini 
&b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3355039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3356039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3357039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3358039c6fbaSStefano Zampini #endif 3359039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3360039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3361039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3362039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3363039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3364a587d139SMark cublasHandle_t cublasv2handle; 3365039c6fbaSStefano Zampini cublasStatus_t berr; 3366a587d139SMark PetscBLASInt one = 1, bnz = 1; 3367039c6fbaSStefano Zampini 3368039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3369039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3370a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3371a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3372a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3373039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3374a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3375a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3376039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3377039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3378a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3379039c6fbaSStefano Zampini } else { 3380a49f1ed0SStefano Zampini ierr = 
MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3381d2be01edSStefano Zampini ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3382a587d139SMark } 338395639643SRichard Tran Mills PetscFunctionReturn(0); 338495639643SRichard Tran Mills } 338595639643SRichard Tran Mills 338633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 338733c9ba73SStefano Zampini { 338833c9ba73SStefano Zampini PetscErrorCode ierr; 338933c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 339033c9ba73SStefano Zampini PetscScalar *ay; 339133c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 339233c9ba73SStefano Zampini cublasStatus_t berr; 339333c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 339433c9ba73SStefano Zampini 339533c9ba73SStefano Zampini PetscFunctionBegin; 339633c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 339733c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 339833c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 339933c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 340033c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 340133c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 340233c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 340333c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 340433c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 340533c9ba73SStefano Zampini PetscFunctionReturn(0); 340633c9ba73SStefano Zampini } 340733c9ba73SStefano Zampini 34083fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 34093fa6b06aSMark Adams { 34103fa6b06aSMark Adams PetscErrorCode ierr; 34117e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3412a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 34137e8381f9SStefano Zampini 34143fa6b06aSMark Adams 
PetscFunctionBegin; 34153fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 34163fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 34177e8381f9SStefano Zampini if (spptr->mat) { 34187e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 34197e8381f9SStefano Zampini if (matrix->values) { 34207e8381f9SStefano Zampini both = PETSC_TRUE; 34217e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34227e8381f9SStefano Zampini } 34237e8381f9SStefano Zampini } 34247e8381f9SStefano Zampini if (spptr->matTranspose) { 34257e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 34267e8381f9SStefano Zampini if (matrix->values) { 34277e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34287e8381f9SStefano Zampini } 34297e8381f9SStefano Zampini } 34303fa6b06aSMark Adams } 3431a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3432a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3433a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 34347e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3435a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 34363fa6b06aSMark Adams PetscFunctionReturn(0); 34373fa6b06aSMark Adams } 34383fa6b06aSMark Adams 3439a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3440a587d139SMark { 3441a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3442a587d139SMark PetscErrorCode ierr; 3443a587d139SMark 3444a587d139SMark PetscFunctionBegin; 3445a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3446a587d139SMark if (flg) { 3447a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3448a587d139SMark 344933c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3450a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3451a587d139SMark 
A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3452a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3453a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3454a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3455a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3456a587d139SMark A->ops->multhermitiantranspose = NULL; 3457a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3458fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3459c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3460a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3461a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3462a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3463a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3464a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3465fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3466a587d139SMark } else { 346733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3468a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3469a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3470a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3471a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3472a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3473a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3474a587d139SMark A->ops->multhermitiantranspose = 
MatMultHermitianTranspose_SeqAIJCUSPARSE; 3475a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3476fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3477c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3478a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3479a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3480a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3481a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3482a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3483fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3484a587d139SMark } 3485a587d139SMark A->boundtocpu = flg; 3486a587d139SMark a->inode.use = flg; 3487a587d139SMark PetscFunctionReturn(0); 3488a587d139SMark } 3489a587d139SMark 349049735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 34919ae82921SPaul Mullowney { 34929ae82921SPaul Mullowney PetscErrorCode ierr; 3493aa372e3fSPaul Mullowney cusparseStatus_t stat; 349449735bf3SStefano Zampini Mat B; 34959ae82921SPaul Mullowney 34969ae82921SPaul Mullowney PetscFunctionBegin; 3497a4af0ceeSJacob Faibussowitsch ierr = 
PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 349849735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 349949735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 350049735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 350149735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 350249735bf3SStefano Zampini } 350349735bf3SStefano Zampini B = *newmat; 350449735bf3SStefano Zampini 350534136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 350634136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 350734136279SStefano Zampini 350849735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 35099ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3510e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3511e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3512e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3513a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 35141a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3515d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3516a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3517a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3518a435da06SStefano Zampini #else 3519d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3520a435da06SStefano Zampini #endif 3521d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3522d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3523d8132acaSStefano Zampini #endif 35241a2c6b5cSJunchao Zhang B->spptr = spptr; 35259ae82921SPaul Mullowney } else { 
3526e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3527e6e9a74fSStefano Zampini 3528e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3529e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3530a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3531e6e9a74fSStefano Zampini B->spptr = spptr; 35329ae82921SPaul Mullowney } 3533e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 353449735bf3SStefano Zampini } 3535693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 35369ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 35371a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 35389ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 353995639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3540693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 35412205254eSKarl Rupp 3542e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 35439ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3544bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3545ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 3546ae48a8d0SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3547ae48a8d0SStefano Zampini #endif 35489ae82921SPaul Mullowney PetscFunctionReturn(0); 35499ae82921SPaul Mullowney } 35509ae82921SPaul Mullowney 355102fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 355202fe1965SBarry Smith { 355302fe1965SBarry Smith PetscErrorCode ierr; 355402fe1965SBarry Smith 355502fe1965SBarry Smith PetscFunctionBegin; 
355602fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 35570ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 355802fe1965SBarry Smith PetscFunctionReturn(0); 355902fe1965SBarry Smith } 356002fe1965SBarry Smith 35613ca39a21SBarry Smith /*MC 3562e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3563e057df02SPaul Mullowney 3564e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 35652692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 35662692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3567e057df02SPaul Mullowney 3568e057df02SPaul Mullowney Options Database Keys: 3569e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3570aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3571a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3572e057df02SPaul Mullowney 3573e057df02SPaul Mullowney Level: beginner 3574e057df02SPaul Mullowney 35758468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3576e057df02SPaul Mullowney M*/ 35777f756511SDominic Meiser 3578bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 35790f39cd5aSBarry Smith 35803ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 358142c9c57cSBarry Smith { 358242c9c57cSBarry Smith PetscErrorCode ierr; 358342c9c57cSBarry Smith 358442c9c57cSBarry Smith PetscFunctionBegin; 3585bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 35863ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 35873ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 35883ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 35893ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3590bddcd29dSMark Adams 359142c9c57cSBarry Smith PetscFunctionReturn(0); 359242c9c57cSBarry Smith } 359329b38603SBarry Smith 3594470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 35957f756511SDominic Meiser { 3596e6e9a74fSStefano Zampini PetscErrorCode ierr; 35977f756511SDominic Meiser cusparseStatus_t stat; 35987f756511SDominic Meiser 35997f756511SDominic Meiser PetscFunctionBegin; 36007f756511SDominic Meiser if 
(*cusparsestruct) { 3601e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3602e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 36037f756511SDominic Meiser delete (*cusparsestruct)->workVector; 360481902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 36057e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 36067e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3607a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 36087e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3609e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 36107f756511SDominic Meiser } 36117f756511SDominic Meiser PetscFunctionReturn(0); 36127f756511SDominic Meiser } 36137f756511SDominic Meiser 36147f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 36157f756511SDominic Meiser { 36167f756511SDominic Meiser PetscFunctionBegin; 36177f756511SDominic Meiser if (*mat) { 36187f756511SDominic Meiser delete (*mat)->values; 36197f756511SDominic Meiser delete (*mat)->column_indices; 36207f756511SDominic Meiser delete (*mat)->row_offsets; 36217f756511SDominic Meiser delete *mat; 36227f756511SDominic Meiser *mat = 0; 36237f756511SDominic Meiser } 36247f756511SDominic Meiser PetscFunctionReturn(0); 36257f756511SDominic Meiser } 36267f756511SDominic Meiser 3627470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 36287f756511SDominic Meiser { 36297f756511SDominic Meiser cusparseStatus_t stat; 36307f756511SDominic Meiser PetscErrorCode ierr; 36317f756511SDominic Meiser 36327f756511SDominic Meiser PetscFunctionBegin; 36337f756511SDominic Meiser if (*trifactor) { 363457d48284SJunchao Zhang if 
((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3635afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 36367f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 36371b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 36382cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3639afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 36401b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3641afb2bd1cSJunchao Zhang #endif 3642da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 36437f756511SDominic Meiser } 36447f756511SDominic Meiser PetscFunctionReturn(0); 36457f756511SDominic Meiser } 36467f756511SDominic Meiser 3647470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 36487f756511SDominic Meiser { 36497f756511SDominic Meiser CsrMatrix *mat; 36507f756511SDominic Meiser cusparseStatus_t stat; 36517f756511SDominic Meiser cudaError_t err; 36527f756511SDominic Meiser 36537f756511SDominic Meiser PetscFunctionBegin; 36547f756511SDominic Meiser if (*matstruct) { 36557f756511SDominic Meiser if ((*matstruct)->mat) { 36567f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3657afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3658afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3659afb2bd1cSJunchao Zhang #else 36607f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 366157d48284SJunchao Zhang stat = 
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3662afb2bd1cSJunchao Zhang #endif 36637f756511SDominic Meiser } else { 36647f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 36657f756511SDominic Meiser CsrMatrix_Destroy(&mat); 36667f756511SDominic Meiser } 36677f756511SDominic Meiser } 366857d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 36697f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3670afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 36717656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 36727656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3673afb2bd1cSJunchao Zhang 3674afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3675afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3676afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3677afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3678afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 3679afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3680afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3681afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3682afb2bd1cSJunchao Zhang } 3683afb2bd1cSJunchao Zhang } 3684afb2bd1cSJunchao Zhang #endif 36857f756511SDominic Meiser delete *matstruct; 36867e8381f9SStefano Zampini *matstruct = NULL; 36877f756511SDominic Meiser } 36887f756511SDominic Meiser PetscFunctionReturn(0); 36897f756511SDominic Meiser } 36907f756511SDominic Meiser 3691e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 
36927f756511SDominic Meiser { 3693e6e9a74fSStefano Zampini PetscErrorCode ierr; 3694e6e9a74fSStefano Zampini 36957f756511SDominic Meiser PetscFunctionBegin; 36967f756511SDominic Meiser if (*trifactors) { 3697e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3698e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3699e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3700e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 37017f756511SDominic Meiser delete (*trifactors)->rpermIndices; 37027f756511SDominic Meiser delete (*trifactors)->cpermIndices; 37037f756511SDominic Meiser delete (*trifactors)->workVector; 37047e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 37057e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 37067e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 3707bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3708bddcd29dSMark Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3709e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3710ccdfe979SStefano Zampini } 3711ccdfe979SStefano Zampini PetscFunctionReturn(0); 3712ccdfe979SStefano Zampini } 3713ccdfe979SStefano Zampini 3714ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3715ccdfe979SStefano Zampini { 3716e6e9a74fSStefano Zampini PetscErrorCode ierr; 3717ccdfe979SStefano Zampini cusparseHandle_t handle; 3718ccdfe979SStefano Zampini cusparseStatus_t stat; 3719ccdfe979SStefano Zampini 3720ccdfe979SStefano Zampini PetscFunctionBegin; 3721ccdfe979SStefano Zampini if (*trifactors) { 
3722e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 37237f756511SDominic Meiser if (handle = (*trifactors)->handle) { 372457d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 37257f756511SDominic Meiser } 3726e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 37277f756511SDominic Meiser } 37287f756511SDominic Meiser PetscFunctionReturn(0); 37297f756511SDominic Meiser } 37307e8381f9SStefano Zampini 37317e8381f9SStefano Zampini struct IJCompare 37327e8381f9SStefano Zampini { 37337e8381f9SStefano Zampini __host__ __device__ 37347e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37357e8381f9SStefano Zampini { 37367e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 37377e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 37387e8381f9SStefano Zampini return false; 37397e8381f9SStefano Zampini } 37407e8381f9SStefano Zampini }; 37417e8381f9SStefano Zampini 37427e8381f9SStefano Zampini struct IJEqual 37437e8381f9SStefano Zampini { 37447e8381f9SStefano Zampini __host__ __device__ 37457e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37467e8381f9SStefano Zampini { 37477e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 37487e8381f9SStefano Zampini return true; 37497e8381f9SStefano Zampini } 37507e8381f9SStefano Zampini }; 37517e8381f9SStefano Zampini 37527e8381f9SStefano Zampini struct IJDiff 37537e8381f9SStefano Zampini { 37547e8381f9SStefano Zampini __host__ __device__ 37557e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37567e8381f9SStefano Zampini { 37577e8381f9SStefano Zampini return t1 == t2 ? 
0 : 1; 37587e8381f9SStefano Zampini } 37597e8381f9SStefano Zampini }; 37607e8381f9SStefano Zampini 37617e8381f9SStefano Zampini struct IJSum 37627e8381f9SStefano Zampini { 37637e8381f9SStefano Zampini __host__ __device__ 37647e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37657e8381f9SStefano Zampini { 37667e8381f9SStefano Zampini return t1||t2; 37677e8381f9SStefano Zampini } 37687e8381f9SStefano Zampini }; 37697e8381f9SStefano Zampini 37707e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3771e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 37727e8381f9SStefano Zampini { 37737e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3774fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3775bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 377608391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 37777e8381f9SStefano Zampini CsrMatrix *matrix; 37787e8381f9SStefano Zampini PetscErrorCode ierr; 37797e8381f9SStefano Zampini PetscInt n; 37807e8381f9SStefano Zampini 37817e8381f9SStefano Zampini PetscFunctionBegin; 37827e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 37837e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 37847e8381f9SStefano Zampini if (!cusp->cooPerm) { 37857e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 37867e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 37877e8381f9SStefano Zampini PetscFunctionReturn(0); 37887e8381f9SStefano Zampini } 37897e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 37907e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3791e61fc153SStefano Zampini if (!v) { 3792e61fc153SStefano Zampini if 
(imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3793e61fc153SStefano Zampini goto finalize; 37947e8381f9SStefano Zampini } 3795e61fc153SStefano Zampini n = cusp->cooPerm->size(); 379608391a17SStefano Zampini if (isCudaMem(v)) { 379708391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 379808391a17SStefano Zampini } else { 3799e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3800e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 380108391a17SStefano Zampini d_v = cooPerm_v->data(); 3802e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 380308391a17SStefano Zampini } 3804bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3805e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3806ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3807bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 380808391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3809ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3810ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3811ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
3812ddea5d60SJunchao Zhang */ 3813e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3814e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3815e61fc153SStefano Zampini delete cooPerm_w; 38167e8381f9SStefano Zampini } else { 3817ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 381808391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38197e8381f9SStefano Zampini matrix->values->begin())); 382008391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 38217e8381f9SStefano Zampini matrix->values->end())); 3822ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 38237e8381f9SStefano Zampini } 38247e8381f9SStefano Zampini } else { 3825e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 382608391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3827e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 38287e8381f9SStefano Zampini } else { 382908391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38307e8381f9SStefano Zampini matrix->values->begin())); 383108391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 
38327e8381f9SStefano Zampini matrix->values->end())); 38337e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 38347e8381f9SStefano Zampini } 38357e8381f9SStefano Zampini } 3836bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3837e61fc153SStefano Zampini finalize: 3838e61fc153SStefano Zampini delete cooPerm_v; 38397e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3840e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3841fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3842fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3843fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3844fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3845fcdce8c4SStefano Zampini a->reallocs = 0; 3846fcdce8c4SStefano Zampini A->info.mallocs += 0; 3847fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3848fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3849fcdce8c4SStefano Zampini A->num_ass++; 38507e8381f9SStefano Zampini PetscFunctionReturn(0); 38517e8381f9SStefano Zampini } 38527e8381f9SStefano Zampini 3853a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3854a49f1ed0SStefano Zampini { 3855a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3856a49f1ed0SStefano Zampini PetscErrorCode ierr; 3857a49f1ed0SStefano Zampini 3858a49f1ed0SStefano Zampini PetscFunctionBegin; 3859a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3860a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3861a49f1ed0SStefano Zampini if (destroy) { 3862a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 
3863a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3864a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3865a49f1ed0SStefano Zampini } 38661a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3867a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3868a49f1ed0SStefano Zampini } 3869a49f1ed0SStefano Zampini 38707e8381f9SStefano Zampini #include <thrust/binary_search.h> 3871e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 38727e8381f9SStefano Zampini { 38737e8381f9SStefano Zampini PetscErrorCode ierr; 38747e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 38757e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 38767e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 38777e8381f9SStefano Zampini cudaError_t cerr; 38787e8381f9SStefano Zampini 38797e8381f9SStefano Zampini PetscFunctionBegin; 38807e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 38817e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 38827e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 38837e8381f9SStefano Zampini if (n != cooPerm_n) { 38847e8381f9SStefano Zampini delete cusp->cooPerm; 38857e8381f9SStefano Zampini delete cusp->cooPerm_a; 38867e8381f9SStefano Zampini cusp->cooPerm = NULL; 38877e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 38887e8381f9SStefano Zampini } 38897e8381f9SStefano Zampini if (n) { 38907e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 38917e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 38927e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 38937e8381f9SStefano Zampini 38947e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 38957e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 38967e8381f9SStefano Zampini 38977e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 38987e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 38997e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 3900ddea5d60SJunchao Zhang 3901ddea5d60SJunchao Zhang /* Ex. 
3902ddea5d60SJunchao Zhang n = 6 3903ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 3904ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 3905ddea5d60SJunchao Zhang */ 39067e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 39077e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 39087e8381f9SStefano Zampini 390908391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 39107e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3911ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3912ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 39137e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 39147e8381f9SStefano Zampini 3915ddea5d60SJunchao Zhang /* 3916ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 3917ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 3918ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 3919ddea5d60SJunchao Zhang */ 3920ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3921ddea5d60SJunchao Zhang 3922ddea5d60SJunchao Zhang /* 3923ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 3924ddea5d60SJunchao Zhang ^ekey 3925ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 3926ddea5d60SJunchao Zhang ^nekye 3927ddea5d60SJunchao Zhang */ 39287e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 39297e8381f9SStefano Zampini delete cusp->cooPerm_a; 39307e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 3931ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3932ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3933ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 3934ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3935ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 39367e8381f9SStefano Zampini w[0] = 0; 3937ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3938ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 39397e8381f9SStefano Zampini } 39407e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 3941ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3942ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3943ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 394408391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 39457e8381f9SStefano Zampini 39467e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 39477e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 39487e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 39497e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 39507e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3951ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 39527e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 39537e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3954fcdce8c4SStefano Zampini a->rmax = 0; 39557e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 39567e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 39577e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 39587e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 39597e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 39607e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 39617e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 39627e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 39637e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3964fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 39657e8381f9SStefano Zampini } 3966fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 39677e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 39687e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3969fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 39707e8381f9SStefano Zampini } else { 39717e8381f9SStefano 
Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 39727e8381f9SStefano Zampini } 3973e61fc153SStefano Zampini ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 39747e8381f9SStefano Zampini 39757e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3976e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 3977e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 39787e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 39797e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 39807e8381f9SStefano Zampini A->nonzerostate++; 39817e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3982a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 39837e8381f9SStefano Zampini 39847e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 39857e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 39867e8381f9SStefano Zampini PetscFunctionReturn(0); 39877e8381f9SStefano Zampini } 3988ed502f03SStefano Zampini 39895b7e41feSStefano Zampini /*@C 39905b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
39915b7e41feSStefano Zampini 39925b7e41feSStefano Zampini Not collective 39935b7e41feSStefano Zampini 39945b7e41feSStefano Zampini Input Parameters: 39955b7e41feSStefano Zampini + A - the matrix 39965b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 39975b7e41feSStefano Zampini 39985b7e41feSStefano Zampini Output Parameters: 39995b7e41feSStefano Zampini + ia - the CSR row pointers 40005b7e41feSStefano Zampini - ja - the CSR column indices 40015b7e41feSStefano Zampini 40025b7e41feSStefano Zampini Level: developer 40035b7e41feSStefano Zampini 40045b7e41feSStefano Zampini Notes: 40055b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 40065b7e41feSStefano Zampini 40075b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 40085b7e41feSStefano Zampini @*/ 40095f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 40105f101d05SStefano Zampini { 40115f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 40125f101d05SStefano Zampini CsrMatrix *csr; 40135f101d05SStefano Zampini PetscErrorCode ierr; 40145f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 40155f101d05SStefano Zampini 40165f101d05SStefano Zampini PetscFunctionBegin; 40175f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 40185f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 40195f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 40205f101d05SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 40215f101d05SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 40225f101d05SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 40235f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 40245f101d05SStefano Zampini if (i) { 40255f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 40265f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 40275f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 40285f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 40295f101d05SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 40305f101d05SStefano Zampini } 40315f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 40325f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 40335f101d05SStefano Zampini } 40345f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 40355f101d05SStefano Zampini PetscFunctionReturn(0); 40365f101d05SStefano Zampini } 40375f101d05SStefano Zampini 40385b7e41feSStefano Zampini /*@C 40395b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 40405b7e41feSStefano Zampini 40415b7e41feSStefano Zampini Not collective 40425b7e41feSStefano Zampini 40435b7e41feSStefano Zampini Input Parameters: 40445b7e41feSStefano Zampini + A - the matrix 40455b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40465b7e41feSStefano Zampini 40475b7e41feSStefano Zampini Output Parameters: 40485b7e41feSStefano Zampini + ia - the CSR row pointers 40495b7e41feSStefano Zampini - ja - the CSR column indices 40505b7e41feSStefano Zampini 40515b7e41feSStefano Zampini Level: developer 40525b7e41feSStefano Zampini 40535b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetIJ() 40545b7e41feSStefano Zampini @*/ 40555f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, 
const int **j) 40565f101d05SStefano Zampini { 40575f101d05SStefano Zampini PetscFunctionBegin; 40585f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 40595f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 40605f101d05SStefano Zampini if (i) *i = NULL; 40615f101d05SStefano Zampini if (j) *j = NULL; 40625f101d05SStefano Zampini PetscFunctionReturn(0); 40635f101d05SStefano Zampini } 40645f101d05SStefano Zampini 40655b7e41feSStefano Zampini /*@C 40665b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 40675b7e41feSStefano Zampini 40685b7e41feSStefano Zampini Not Collective 40695b7e41feSStefano Zampini 40705b7e41feSStefano Zampini Input Parameter: 40715b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 40725b7e41feSStefano Zampini 40735b7e41feSStefano Zampini Output Parameter: 40745b7e41feSStefano Zampini . a - pointer to the device data 40755b7e41feSStefano Zampini 40765b7e41feSStefano Zampini Level: developer 40775b7e41feSStefano Zampini 40785b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 40795b7e41feSStefano Zampini 40805b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 40815b7e41feSStefano Zampini @*/ 4082ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4083ed502f03SStefano Zampini { 4084ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4085ed502f03SStefano Zampini CsrMatrix *csr; 4086ed502f03SStefano Zampini PetscErrorCode ierr; 4087ed502f03SStefano Zampini 4088ed502f03SStefano Zampini PetscFunctionBegin; 4089ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4090ed502f03SStefano Zampini PetscValidPointer(a,2); 4091ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 
4092ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4093ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 409433c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4095ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4096ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4097ed502f03SStefano Zampini *a = csr->values->data().get(); 4098ed502f03SStefano Zampini PetscFunctionReturn(0); 4099ed502f03SStefano Zampini } 4100ed502f03SStefano Zampini 41015b7e41feSStefano Zampini /*@C 41025b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 41035b7e41feSStefano Zampini 41045b7e41feSStefano Zampini Not Collective 41055b7e41feSStefano Zampini 41065b7e41feSStefano Zampini Input Parameter: 41075b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41085b7e41feSStefano Zampini 41095b7e41feSStefano Zampini Output Parameter: 41105b7e41feSStefano Zampini . 
a - pointer to the device data 41115b7e41feSStefano Zampini 41125b7e41feSStefano Zampini Level: developer 41135b7e41feSStefano Zampini 41145b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead() 41155b7e41feSStefano Zampini @*/ 4116ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4117ed502f03SStefano Zampini { 4118ed502f03SStefano Zampini PetscFunctionBegin; 4119ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4120ed502f03SStefano Zampini PetscValidPointer(a,2); 4121ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4122ed502f03SStefano Zampini *a = NULL; 4123ed502f03SStefano Zampini PetscFunctionReturn(0); 4124ed502f03SStefano Zampini } 4125ed502f03SStefano Zampini 41265b7e41feSStefano Zampini /*@C 41275b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41285b7e41feSStefano Zampini 41295b7e41feSStefano Zampini Not Collective 41305b7e41feSStefano Zampini 41315b7e41feSStefano Zampini Input Parameter: 41325b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41335b7e41feSStefano Zampini 41345b7e41feSStefano Zampini Output Parameter: 41355b7e41feSStefano Zampini . 
a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");

  /* the caller may read the values, so the device copy is brought up to date first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();

  /* the caller may also write: flag the GPU copy as the valid one and mark the cached
     transpose as stale (it is not destroyed) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access
array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data, set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* the values may have been modified through the pointer: bump the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data 42015b7e41feSStefano Zampini 42025b7e41feSStefano Zampini Level: developer 42035b7e41feSStefano Zampini 42045b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 42055b7e41feSStefano Zampini 42065b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 42075b7e41feSStefano Zampini @*/ 4208ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4209ed502f03SStefano Zampini { 4210ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4211ed502f03SStefano Zampini CsrMatrix *csr; 4212a49f1ed0SStefano Zampini PetscErrorCode ierr; 4213ed502f03SStefano Zampini 4214ed502f03SStefano Zampini PetscFunctionBegin; 4215ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4216ed502f03SStefano Zampini PetscValidPointer(a,2); 4217ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4218ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 421933c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4220ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4221ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4222ed502f03SStefano Zampini *a = csr->values->data().get(); 4223039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4224a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4225ed502f03SStefano Zampini PetscFunctionReturn(0); 4226ed502f03SStefano Zampini } 4227ed502f03SStefano Zampini 42285b7e41feSStefano Zampini /*@C 42295b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from 
MatSeqAIJCUSPARSEGetArrayWrite() 42305b7e41feSStefano Zampini 42315b7e41feSStefano Zampini Not Collective 42325b7e41feSStefano Zampini 42335b7e41feSStefano Zampini Input Parameter: 42345b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42355b7e41feSStefano Zampini 42365b7e41feSStefano Zampini Output Parameter: 42375b7e41feSStefano Zampini . a - pointer to the device data 42385b7e41feSStefano Zampini 42395b7e41feSStefano Zampini Level: developer 42405b7e41feSStefano Zampini 42415b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 42425b7e41feSStefano Zampini @*/ 4243ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4244ed502f03SStefano Zampini { 4245ed502f03SStefano Zampini PetscErrorCode ierr; 4246ed502f03SStefano Zampini 4247ed502f03SStefano Zampini PetscFunctionBegin; 4248ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4249ed502f03SStefano Zampini PetscValidPointer(a,2); 4250ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4251ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4252ed502f03SStefano Zampini *a = NULL; 4253ed502f03SStefano Zampini PetscFunctionReturn(0); 4254ed502f03SStefano Zampini } 4255ed502f03SStefano Zampini 4256ed502f03SStefano Zampini struct IJCompare4 4257ed502f03SStefano Zampini { 4258ed502f03SStefano Zampini __host__ __device__ 42592ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4260ed502f03SStefano Zampini { 4261ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4262ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4263ed502f03SStefano Zampini return false; 4264ed502f03SStefano Zampini } 4265ed502f03SStefano Zampini }; 4266ed502f03SStefano Zampini 42678909a122SStefano Zampini struct Shift 42688909a122SStefano Zampini { 
4269ed502f03SStefano Zampini int _shift; 4270ed502f03SStefano Zampini 4271ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4272ed502f03SStefano Zampini __host__ __device__ 4273ed502f03SStefano Zampini inline int operator() (const int &c) 4274ed502f03SStefano Zampini { 4275ed502f03SStefano Zampini return c + _shift; 4276ed502f03SStefano Zampini } 4277ed502f03SStefano Zampini }; 4278ed502f03SStefano Zampini 4279ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4280ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4281ed502f03SStefano Zampini { 4282ed502f03SStefano Zampini PetscErrorCode ierr; 4283ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4284ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4285ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4286ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4287ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4288ed502f03SStefano Zampini cusparseStatus_t stat; 4289ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4290ed502f03SStefano Zampini cudaError_t cerr; 4291ed502f03SStefano Zampini 4292ed502f03SStefano Zampini PetscFunctionBegin; 4293ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4294ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4295ed502f03SStefano Zampini PetscValidPointer(C,4); 4296ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4297ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4298ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4299ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4300ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4301ed502f03SStefano Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4302ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4303ed502f03SStefano Zampini m = A->rmap->n; 4304ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 4305ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4306ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4307ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4308ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4309ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4310ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4311ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4312ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4313ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4314ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4315ed502f03SStefano Zampini c->compressedrow.i = NULL; 4316ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4317ed502f03SStefano Zampini Ccusp->workVector = NULL; 4318ed502f03SStefano Zampini Ccusp->nrows = m; 4319ed502f03SStefano Zampini Ccusp->mat = Cmat; 4320ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4321ed502f03SStefano Zampini Ccsr->num_rows = m; 4322ed502f03SStefano Zampini Ccsr->num_cols = n; 4323ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4324ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4325ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 
4326ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4327ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4328ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4329ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4330ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4331ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4332ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4333ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4334ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4335ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4336ed502f03SStefano Zampini 4337ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4338ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4339ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4340ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4341ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4342ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4343ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4344ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4345ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4346ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4347ed502f03SStefano Zampini if (c->nz) { 43482ed87e7eSStefano Zampini auto Acoo = new 
THRUSTINTARRAY32(Annz); 43492ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 43502ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 43512ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 43522ed87e7eSStefano Zampini 4353ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4354ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4355ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4356ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4357ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4358ed502f03SStefano Zampini } 43592ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 43602ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4361ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4362ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4363ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4364ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4365ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4366ed502f03SStefano Zampini } 43672ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 43682ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 4369ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 43702ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 43712ed87e7eSStefano Zampini Aroff->data().get(), 43722ed87e7eSStefano Zampini Annz, 43732ed87e7eSStefano Zampini m, 43742ed87e7eSStefano Zampini Acoo->data().get(), 43752ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4376ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 43772ed87e7eSStefano Zampini Broff->data().get(), 4378ed502f03SStefano Zampini Bnnz, 4379ed502f03SStefano Zampini m, 
43802ed87e7eSStefano Zampini Bcoo->data().get(), 4381ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 43822ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 43832ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 43842ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 43858909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4386ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4387ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 43888909a122SStefano Zampini #else 43898909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 43908909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 43918909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 43928909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 43938909a122SStefano Zampini #endif 43942ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 43952ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 43962ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 43972ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 43982ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 43992ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4400ed502f03SStefano Zampini auto p1 = 
Ccusp->cooPerm->begin(); 4401ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4402ed502f03SStefano Zampini thrust::advance(p2,Annz); 44032ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 44048909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 44058909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 44068909a122SStefano Zampini #endif 44072ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 44082ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 44092ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 44102ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 44112ed87e7eSStefano Zampini #else 44122ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 44132ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 44142ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 44152ed87e7eSStefano Zampini #endif 4416ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 44172ed87e7eSStefano Zampini Ccoo->data().get(), 4418ed502f03SStefano Zampini c->nz, 4419ed502f03SStefano Zampini m, 4420ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4421ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4422ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 44232ed87e7eSStefano Zampini delete wPerm; 44242ed87e7eSStefano Zampini delete Acoo; 44252ed87e7eSStefano Zampini delete Bcoo; 44262ed87e7eSStefano Zampini delete Ccoo; 4427ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4428ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4429ed502f03SStefano Zampini 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4430ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4431ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4432ed502f03SStefano Zampini #endif 44331a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 44343606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 44353606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4436ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4437ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4438ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4439ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4440ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4441ed502f03SStefano Zampini 44421a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 44431a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4444a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4445ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4446ed502f03SStefano Zampini CmatT->mat = CcsrT; 4447ed502f03SStefano Zampini CcsrT->num_rows = n; 4448ed502f03SStefano Zampini CcsrT->num_cols = m; 4449ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4450ed502f03SStefano Zampini 4451ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4452ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4453ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4454ed502f03SStefano Zampini 4455ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4456ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4457ed502f03SStefano Zampini if (AT) { 4458ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4459ed502f03SStefano Zampini thrust::advance(rT,-1); 4460ed502f03SStefano Zampini } 4461ed502f03SStefano Zampini if (BT) { 4462ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4463ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4464ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4465ed502f03SStefano Zampini } 4466ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4467ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4468ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4469ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4470ed502f03SStefano Zampini if (AT) vT = 
thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4471ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4472ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4473ed502f03SStefano Zampini 4474ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4475ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4476ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4477ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4478ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4479ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4480ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4481ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4482ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4483ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4484ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4485ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4486ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4487ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4488ed502f03SStefano Zampini #endif 4489ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4490ed502f03SStefano Zampini 
} 4491ed502f03SStefano Zampini } 4492ed502f03SStefano Zampini 4493ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4494ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4495ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4496ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4497ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4498ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4499ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4500ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4501ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4502ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4503ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4504ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4505ed502f03SStefano Zampini } else { 4506ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4507ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4508ed502f03SStefano Zampini } 4509ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4510ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4511ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4512ed502f03SStefano Zampini c->maxnz = c->nz; 4513ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4514ed502f03SStefano Zampini c->rmax = 0; 4515ed502f03SStefano Zampini for (i = 0; i < m; i++) { 
4516ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4517ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4518ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4519ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4520ed502f03SStefano Zampini } 4521ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4522ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4523ed502f03SStefano Zampini (*C)->nonzerostate++; 4524ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4525ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4526ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4527ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4528ed502f03SStefano Zampini } else { 4529ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4530ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4531ed502f03SStefano Zampini if (c->nz) { 4532ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4533ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4534ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4535ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4536ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4537ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4538ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4539ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 
4540ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4541ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4542ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4543ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4544ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4545ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4546ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4547ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4548ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4549ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4550ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4551ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4552ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4553ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4554ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4555ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4556ed502f03SStefano Zampini auto zibbit = 
thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4557ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4558ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4559ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4560ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4561a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 45621a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4563ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4564ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4565ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4566ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4567ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4568ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4569ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4570ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 45711a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4572ed502f03SStefano Zampini } 4573ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4574ed502f03SStefano Zampini } 4575ed502f03SStefano Zampini } 4576ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4577ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4578ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4579ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4580ed502f03SStefano Zampini PetscFunctionReturn(0); 4581ed502f03SStefano Zampini } 4582c215019aSStefano Zampini 4583c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4584c215019aSStefano Zampini { 4585c215019aSStefano Zampini PetscErrorCode ierr; 4586c215019aSStefano Zampini bool dmem; 4587c215019aSStefano Zampini const PetscScalar *av; 4588c215019aSStefano Zampini cudaError_t cerr; 4589c215019aSStefano Zampini 4590c215019aSStefano Zampini PetscFunctionBegin; 4591c215019aSStefano Zampini dmem = isCudaMem(v); 4592c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4593c215019aSStefano Zampini if (n && idx) { 4594c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4595c215019aSStefano Zampini widx.assign(idx,idx+n); 4596c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4597c215019aSStefano Zampini 4598c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4599c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 
4600c215019aSStefano Zampini if (dmem) { 4601c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4602c215019aSStefano Zampini } else { 4603c215019aSStefano Zampini w = new THRUSTARRAY(n); 4604c215019aSStefano Zampini dv = w->data(); 4605c215019aSStefano Zampini } 4606c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4607c215019aSStefano Zampini 4608c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4609c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4610c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4611c215019aSStefano Zampini if (w) { 4612c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4613c215019aSStefano Zampini } 4614c215019aSStefano Zampini delete w; 4615c215019aSStefano Zampini } else { 4616c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4617c215019aSStefano Zampini } 4618c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4619c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4620c215019aSStefano Zampini PetscFunctionReturn(0); 4621c215019aSStefano Zampini } 4622