19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 16e8d2b73aSMark Adams 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 726fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 769ae82921SPaul Mullowney 777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 827f756511SDominic Meiser 83042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 8657181aedSStefano Zampini 877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 897e8381f9SStefano Zampini 90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91c215019aSStefano Zampini 92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93b06137fdSPaul Mullowney { 94b06137fdSPaul Mullowney cusparseStatus_t stat; 95b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 96b06137fdSPaul Mullowney 97b06137fdSPaul Mullowney PetscFunctionBegin; 98d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10057d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101b06137fdSPaul Mullowney PetscFunctionReturn(0); 102b06137fdSPaul Mullowney } 103b06137fdSPaul Mullowney 104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105b06137fdSPaul Mullowney { 106b06137fdSPaul Mullowney cusparseStatus_t stat; 107b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108b06137fdSPaul Mullowney 109b06137fdSPaul Mullowney PetscFunctionBegin; 110d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1116b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11216a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11357d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11416a2e217SAlejandro Lamas Daviña } 115b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1166b1cf21dSAlejandro Lamas Daviña } 11757d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118b06137fdSPaul Mullowney PetscFunctionReturn(0); 119b06137fdSPaul Mullowney } 120b06137fdSPaul Mullowney 121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122b06137fdSPaul Mullowney { 123b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1247e8381f9SStefano Zampini PetscBool flg; 1257e8381f9SStefano Zampini PetscErrorCode ierr; 126ccdfe979SStefano Zampini 127b06137fdSPaul Mullowney PetscFunctionBegin; 1287e8381f9SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1297e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 131b06137fdSPaul Mullowney PetscFunctionReturn(0); 132b06137fdSPaul Mullowney } 133b06137fdSPaul Mullowney 134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1359ae82921SPaul Mullowney { 1369ae82921SPaul Mullowney PetscFunctionBegin; 1379ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1389ae82921SPaul Mullowney PetscFunctionReturn(0); 1399ae82921SPaul Mullowney } 1409ae82921SPaul Mullowney 141c708e6cdSJed Brown /*MC 142087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
148c708e6cdSJed Brown 1499ae82921SPaul Mullowney Level: beginner 150c708e6cdSJed Brown 1513ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152c708e6cdSJed Brown M*/ 1539ae82921SPaul Mullowney 15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1559ae82921SPaul Mullowney { 1569ae82921SPaul Mullowney PetscErrorCode ierr; 157bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1589ae82921SPaul Mullowney 1599ae82921SPaul Mullowney PetscFunctionBegin; 160bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1622c7c0729SBarry Smith (*B)->factortype = ftype; 1639ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1642205254eSKarl Rupp 165087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16633d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1679ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1689ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1694ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1704ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 1714ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 172087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174087f3262SPaul Mullowney 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1754ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 1764ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 1779ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 178bc3f50f2SPaul Mullowney 179fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1804ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1813ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1829ae82921SPaul Mullowney PetscFunctionReturn(0); 1839ae82921SPaul Mullowney } 1849ae82921SPaul Mullowney 185bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 186ca45077fSPaul Mullowney { 187aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1886e111a19SKarl Rupp 189ca45077fSPaul Mullowney PetscFunctionBegin; 190ca45077fSPaul Mullowney switch (op) { 191e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 192aa372e3fSPaul Mullowney cusparsestruct->format = format; 193ca45077fSPaul Mullowney break; 194e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 195aa372e3fSPaul Mullowney cusparsestruct->format = format; 196ca45077fSPaul Mullowney break; 197ca45077fSPaul Mullowney default: 19836d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 199ca45077fSPaul Mullowney } 200ca45077fSPaul Mullowney PetscFunctionReturn(0); 201ca45077fSPaul Mullowney } 2029ae82921SPaul Mullowney 203e057df02SPaul Mullowney /*@ 204e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 205e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 206aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 207e057df02SPaul Mullowney Not Collective 208e057df02SPaul Mullowney 209e057df02SPaul Mullowney Input Parameters: 2108468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 21136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2122692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 213e057df02SPaul Mullowney 214e057df02SPaul Mullowney Output Parameter: 215e057df02SPaul Mullowney 216e057df02SPaul Mullowney Level: intermediate 217e057df02SPaul Mullowney 2188468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 219e057df02SPaul Mullowney @*/ 220e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 221e057df02SPaul Mullowney { 222e057df02SPaul Mullowney PetscErrorCode ierr; 2236e111a19SKarl Rupp 224e057df02SPaul Mullowney PetscFunctionBegin; 225e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 226e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 227e057df02SPaul Mullowney PetscFunctionReturn(0); 228e057df02SPaul Mullowney } 229e057df02SPaul Mullowney 2301a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 231e6e9a74fSStefano Zampini { 232e6e9a74fSStefano Zampini PetscErrorCode ierr; 233e6e9a74fSStefano Zampini 234e6e9a74fSStefano Zampini PetscFunctionBegin; 2351a2c6b5cSJunchao Zhang switch (op) { 2361a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2371a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2381a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2391a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2401a2c6b5cSJunchao Zhang break; 2411a2c6b5cSJunchao Zhang default: 2421a2c6b5cSJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2431a2c6b5cSJunchao Zhang break; 244e6e9a74fSStefano Zampini } 245e6e9a74fSStefano Zampini PetscFunctionReturn(0); 246e6e9a74fSStefano Zampini } 247e6e9a74fSStefano Zampini 248bddcd29dSMark Adams static 
PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 249bddcd29dSMark Adams 250bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 251bddcd29dSMark Adams { 252bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 253bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 254bddcd29dSMark Adams PetscBool row_identity,col_identity; 255bddcd29dSMark Adams PetscErrorCode ierr; 256bddcd29dSMark Adams 257bddcd29dSMark Adams PetscFunctionBegin; 258bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 259bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 260bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 261bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 262bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 263bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 264bddcd29dSMark Adams if (row_identity && col_identity) { 265bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 266bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 267bddcd29dSMark Adams B->ops->matsolve = NULL; 268bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 269bddcd29dSMark Adams } else { 270bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 271bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 272bddcd29dSMark Adams B->ops->matsolve = NULL; 273bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 274bddcd29dSMark Adams } 275bddcd29dSMark Adams 276bddcd29dSMark Adams /* get the triangular factors */ 277bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 278bddcd29dSMark Adams PetscFunctionReturn(0); 279bddcd29dSMark Adams } 280bddcd29dSMark Adams 2814416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems 
*PetscOptionsObject,Mat A) 2829ae82921SPaul Mullowney { 2839ae82921SPaul Mullowney PetscErrorCode ierr; 284e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2859ae82921SPaul Mullowney PetscBool flg; 286a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2876e111a19SKarl Rupp 2889ae82921SPaul Mullowney PetscFunctionBegin; 289e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2909ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 291e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 292a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 293afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 294afb2bd1cSJunchao Zhang 2954c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 296a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 297afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 298afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 299afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 300afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 301afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 302afb2bd1cSJunchao 
Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 303afb2bd1cSJunchao Zhang 304afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 305afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 306afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 307afb2bd1cSJunchao Zhang 308afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 309afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 310afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 311afb2bd1cSJunchao Zhang #endif 3124c87dfd4SPaul Mullowney } 3130af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3149ae82921SPaul Mullowney PetscFunctionReturn(0); 3159ae82921SPaul Mullowney } 3169ae82921SPaul Mullowney 3176fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3189ae82921SPaul Mullowney { 319da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3209ae82921SPaul Mullowney PetscErrorCode ierr; 3219ae82921SPaul Mullowney 3229ae82921SPaul Mullowney PetscFunctionBegin; 323da79fbbcSStefano Zampini ierr = 
MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3249ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3259ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3269ae82921SPaul Mullowney PetscFunctionReturn(0); 3279ae82921SPaul Mullowney } 3289ae82921SPaul Mullowney 3296fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3309ae82921SPaul Mullowney { 331da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3329ae82921SPaul Mullowney PetscErrorCode ierr; 3339ae82921SPaul Mullowney 3349ae82921SPaul Mullowney PetscFunctionBegin; 335da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3369ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3379ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3389ae82921SPaul Mullowney PetscFunctionReturn(0); 3399ae82921SPaul Mullowney } 3409ae82921SPaul Mullowney 341087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 342087f3262SPaul Mullowney { 343da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 344087f3262SPaul Mullowney PetscErrorCode ierr; 345087f3262SPaul Mullowney 346087f3262SPaul Mullowney PetscFunctionBegin; 347da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 348087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 349087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 350087f3262SPaul Mullowney PetscFunctionReturn(0); 351087f3262SPaul Mullowney } 352087f3262SPaul Mullowney 353087f3262SPaul 
Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 354087f3262SPaul Mullowney { 355da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 356087f3262SPaul Mullowney PetscErrorCode ierr; 357087f3262SPaul Mullowney 358087f3262SPaul Mullowney PetscFunctionBegin; 359da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 360087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 361087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 362087f3262SPaul Mullowney PetscFunctionReturn(0); 363087f3262SPaul Mullowney } 364087f3262SPaul Mullowney 365087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3669ae82921SPaul Mullowney { 3679ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3689ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3699ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 370aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3719ae82921SPaul Mullowney cusparseStatus_t stat; 3729ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3739ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3749ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3759ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 376b175d8bbSPaul Mullowney PetscErrorCode ierr; 37757d48284SJunchao Zhang cudaError_t cerr; 3789ae82921SPaul Mullowney 3799ae82921SPaul Mullowney PetscFunctionBegin; 380cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 381c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3829ae82921SPaul Mullowney try { 3839ae82921SPaul 
Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3849ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 385da79fbbcSStefano Zampini if (!loTriFactor) { 3862cbc15d9SMark PetscScalar *AALo; 3872cbc15d9SMark 3882cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3899ae82921SPaul Mullowney 3909ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 39157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 39257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 3939ae82921SPaul Mullowney 3949ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3959ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3969ae82921SPaul Mullowney AiLo[n] = nzLower; 3979ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3989ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3999ae82921SPaul Mullowney v = aa; 4009ae82921SPaul Mullowney vi = aj; 4019ae82921SPaul Mullowney offset = 1; 4029ae82921SPaul Mullowney rowOffset= 1; 4039ae82921SPaul Mullowney for (i=1; i<n; i++) { 4049ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 405e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4069ae82921SPaul Mullowney AiLo[i] = rowOffset; 4079ae82921SPaul Mullowney rowOffset += nz+1; 4089ae82921SPaul Mullowney 409580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 410580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 4119ae82921SPaul Mullowney 4129ae82921SPaul Mullowney offset += nz; 4139ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4149ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4159ae82921SPaul Mullowney offset += 1; 4169ae82921SPaul Mullowney 4179ae82921SPaul Mullowney v += nz; 4189ae82921SPaul Mullowney vi += nz; 4199ae82921SPaul Mullowney } 4202205254eSKarl Rupp 421aa372e3fSPaul 
Mullowney /* allocate space for the triangular factor information */ 422da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 423da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 424aa372e3fSPaul Mullowney /* Create the matrix description */ 42557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 42657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4271b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 428afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 429afb2bd1cSJunchao Zhang #else 43057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 431afb2bd1cSJunchao Zhang #endif 43257d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 43357d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 434aa372e3fSPaul Mullowney 435aa372e3fSPaul Mullowney /* set the operation */ 436aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 437aa372e3fSPaul Mullowney 438aa372e3fSPaul Mullowney /* set the matrix */ 439aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 440aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 441aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 442aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 443aa372e3fSPaul Mullowney 444aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 445aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 446aa372e3fSPaul Mullowney 447aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 448aa372e3fSPaul Mullowney 
loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 449aa372e3fSPaul Mullowney 450aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 451aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 452aa372e3fSPaul Mullowney 453afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 454da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 455afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 457afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 458afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 459afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 460afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 461afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 462afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 463afb2bd1cSJunchao Zhang #endif 464afb2bd1cSJunchao Zhang 465aa372e3fSPaul Mullowney /* perform the solve analysis */ 466aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 467aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 468aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 469d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 4701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 471d49cd2b7SBarry Smith loTriFactor->solveInfo, 472d49cd2b7SBarry Smith loTriFactor->solvePolicy, 
loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 473d49cd2b7SBarry Smith #else 474d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 475afb2bd1cSJunchao Zhang #endif 476da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 477da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 478aa372e3fSPaul Mullowney 479da79fbbcSStefano Zampini /* assign the pointer */ 480aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4812cbc15d9SMark loTriFactor->AA_h = AALo; 48257d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 48357d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4844863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 485da79fbbcSStefano Zampini } else { /* update values only */ 4862cbc15d9SMark if (!loTriFactor->AA_h) { 4872cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4882cbc15d9SMark } 489da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4902cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 491da79fbbcSStefano Zampini v = aa; 492da79fbbcSStefano Zampini vi = aj; 493da79fbbcSStefano Zampini offset = 1; 494da79fbbcSStefano Zampini for (i=1; i<n; i++) { 495da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 4962cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 497da79fbbcSStefano Zampini offset += nz; 4982cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 499da79fbbcSStefano Zampini offset += 1; 500da79fbbcSStefano Zampini v += nz; 501da79fbbcSStefano Zampini } 5022cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 503da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 504da79fbbcSStefano Zampini } 5059ae82921SPaul Mullowney } catch(char *ex) { 5069ae82921SPaul Mullowney 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5079ae82921SPaul Mullowney } 5089ae82921SPaul Mullowney } 5099ae82921SPaul Mullowney PetscFunctionReturn(0); 5109ae82921SPaul Mullowney } 5119ae82921SPaul Mullowney 512087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5139ae82921SPaul Mullowney { 5149ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5159ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5169ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 517aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5189ae82921SPaul Mullowney cusparseStatus_t stat; 5199ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5209ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5219ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5229ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5239ae82921SPaul Mullowney PetscErrorCode ierr; 52457d48284SJunchao Zhang cudaError_t cerr; 5259ae82921SPaul Mullowney 5269ae82921SPaul Mullowney PetscFunctionBegin; 527cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 528c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5299ae82921SPaul Mullowney try { 5309ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5319ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 532da79fbbcSStefano Zampini if (!upTriFactor) { 5332cbc15d9SMark PetscScalar *AAUp; 5342cbc15d9SMark 5352cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5362cbc15d9SMark 5379ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 53857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 53957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5409ae82921SPaul Mullowney 5419ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5429ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5439ae82921SPaul Mullowney AiUp[n]=nzUpper; 5449ae82921SPaul Mullowney offset = nzUpper; 5459ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5469ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5479ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5489ae82921SPaul Mullowney 549e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5509ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5519ae82921SPaul Mullowney 552e057df02SPaul Mullowney /* decrement the offset */ 5539ae82921SPaul Mullowney offset -= (nz+1); 5549ae82921SPaul Mullowney 555e057df02SPaul Mullowney /* first, set the diagonal elements */ 5569ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 55709f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5589ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5599ae82921SPaul Mullowney 560580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 561580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5629ae82921SPaul Mullowney } 5632205254eSKarl Rupp 564aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 565da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 566da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5672205254eSKarl Rupp 568aa372e3fSPaul Mullowney /* Create the matrix description */ 56957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 57057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 572afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 573afb2bd1cSJunchao Zhang #else 57457d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 575afb2bd1cSJunchao Zhang #endif 57657d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 57757d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 578aa372e3fSPaul Mullowney 579aa372e3fSPaul Mullowney /* set the operation */ 580aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 581aa372e3fSPaul Mullowney 582aa372e3fSPaul Mullowney /* set the matrix */ 583aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 584aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 585aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 586aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 587aa372e3fSPaul Mullowney 588aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 589aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 590aa372e3fSPaul Mullowney 591aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 592aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 593aa372e3fSPaul Mullowney 594aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
595aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 596aa372e3fSPaul Mullowney 597afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 598da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 599afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 6001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 601afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 602afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 603afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 604afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 605afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 606afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 607afb2bd1cSJunchao Zhang #endif 608afb2bd1cSJunchao Zhang 609aa372e3fSPaul Mullowney /* perform the solve analysis */ 610aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 611aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 612aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 613d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 615d49cd2b7SBarry Smith upTriFactor->solveInfo, 616d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 617d49cd2b7SBarry Smith #else 618d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 619afb2bd1cSJunchao Zhang #endif 620da79fbbcSStefano Zampini cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 621da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 622aa372e3fSPaul Mullowney 623da79fbbcSStefano Zampini /* assign the pointer */ 624aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6252cbc15d9SMark upTriFactor->AA_h = AAUp; 62657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 62757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6284863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 629da79fbbcSStefano Zampini } else { 6302cbc15d9SMark if (!upTriFactor->AA_h) { 6312cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6322cbc15d9SMark } 633da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 634da79fbbcSStefano Zampini offset = nzUpper; 635da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 636da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 637da79fbbcSStefano Zampini 638da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 639da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 640da79fbbcSStefano Zampini 641da79fbbcSStefano Zampini /* decrement the offset */ 642da79fbbcSStefano Zampini offset -= (nz+1); 643da79fbbcSStefano Zampini 644da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6452cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6462cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 647da79fbbcSStefano Zampini } 6482cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 649da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 650da79fbbcSStefano Zampini } 6519ae82921SPaul Mullowney } catch(char *ex) { 6529ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6539ae82921SPaul 
Mullowney } 6549ae82921SPaul Mullowney } 6559ae82921SPaul Mullowney PetscFunctionReturn(0); 6569ae82921SPaul Mullowney } 6579ae82921SPaul Mullowney 658087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6599ae82921SPaul Mullowney { 6609ae82921SPaul Mullowney PetscErrorCode ierr; 6619ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6629ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6639ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6649ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6659ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6669ae82921SPaul Mullowney 6679ae82921SPaul Mullowney PetscFunctionBegin; 668da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 669087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 670087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6712205254eSKarl Rupp 672da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 673aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6749ae82921SPaul Mullowney 675c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 676e057df02SPaul Mullowney /* lower triangular indices */ 6779ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 678da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 679da79fbbcSStefano Zampini const PetscInt *r; 680da79fbbcSStefano Zampini 681da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 682aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 683aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6849ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 685da79fbbcSStefano Zampini ierr 
= PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 686da79fbbcSStefano Zampini } 6879ae82921SPaul Mullowney 688e057df02SPaul Mullowney /* upper triangular indices */ 6899ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 690da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 691da79fbbcSStefano Zampini const PetscInt *c; 692da79fbbcSStefano Zampini 693da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 694aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 695aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6969ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 697da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 698da79fbbcSStefano Zampini } 6999ae82921SPaul Mullowney PetscFunctionReturn(0); 7009ae82921SPaul Mullowney } 7019ae82921SPaul Mullowney 702087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 703087f3262SPaul Mullowney { 704087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 705087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 706aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 707aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 708087f3262SPaul Mullowney cusparseStatus_t stat; 709087f3262SPaul Mullowney PetscErrorCode ierr; 71057d48284SJunchao Zhang cudaError_t cerr; 711087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 712087f3262SPaul Mullowney PetscScalar *AAUp; 713087f3262SPaul Mullowney PetscScalar *AALo; 714087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 715087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 716087f3262SPaul 
Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 717087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 718087f3262SPaul Mullowney 719087f3262SPaul Mullowney PetscFunctionBegin; 720cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 721c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 722087f3262SPaul Mullowney try { 723da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 724da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 725da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 726087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 72757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 72857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 729087f3262SPaul Mullowney 730087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 731087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 732087f3262SPaul Mullowney AiUp[n]=nzUpper; 733087f3262SPaul Mullowney offset = 0; 734087f3262SPaul Mullowney for (i=0; i<n; i++) { 735087f3262SPaul Mullowney /* set the pointers */ 736087f3262SPaul Mullowney v = aa + ai[i]; 737087f3262SPaul Mullowney vj = aj + ai[i]; 738087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 739087f3262SPaul Mullowney 740087f3262SPaul Mullowney /* first, set the diagonal elements */ 741087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 74209f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 743087f3262SPaul Mullowney AiUp[i] = offset; 74409f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 745087f3262SPaul Mullowney 746087f3262SPaul Mullowney offset+=1; 747087f3262SPaul Mullowney if (nz>0) { 748f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 
749580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 750087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 751087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 752087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 753087f3262SPaul Mullowney } 754087f3262SPaul Mullowney offset+=nz; 755087f3262SPaul Mullowney } 756087f3262SPaul Mullowney } 757087f3262SPaul Mullowney 758aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 759da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 760da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 761087f3262SPaul Mullowney 762aa372e3fSPaul Mullowney /* Create the matrix description */ 76357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 76457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 766afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 767afb2bd1cSJunchao Zhang #else 76857d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 769afb2bd1cSJunchao Zhang #endif 77057d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 77157d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 772087f3262SPaul Mullowney 773aa372e3fSPaul Mullowney /* set the matrix */ 774aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 775aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 776aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 777aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 778aa372e3fSPaul Mullowney 779aa372e3fSPaul Mullowney 
upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 780aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 781aa372e3fSPaul Mullowney 782aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 783aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 784aa372e3fSPaul Mullowney 785aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 786aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 787aa372e3fSPaul Mullowney 788afb2bd1cSJunchao Zhang /* set the operation */ 789afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 790afb2bd1cSJunchao Zhang 791afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 792da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 793afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 795afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 796afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 797afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 798afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 799afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 800afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 801afb2bd1cSJunchao Zhang #endif 802afb2bd1cSJunchao Zhang 803aa372e3fSPaul Mullowney /* perform the solve analysis */ 804aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 805aa372e3fSPaul Mullowney 
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 806aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 807d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 8081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 809d49cd2b7SBarry Smith upTriFactor->solveInfo, 810d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 811d49cd2b7SBarry Smith #else 812d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 813afb2bd1cSJunchao Zhang #endif 814da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 815da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816aa372e3fSPaul Mullowney 817da79fbbcSStefano Zampini /* assign the pointer */ 818aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819aa372e3fSPaul Mullowney 820aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 821da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823aa372e3fSPaul Mullowney 824aa372e3fSPaul Mullowney /* Create the matrix description */ 82557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 82657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8271b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829afb2bd1cSJunchao Zhang #else 83057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831afb2bd1cSJunchao Zhang #endif 83257d48284SJunchao Zhang stat = 
cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 83357d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834aa372e3fSPaul Mullowney 835aa372e3fSPaul Mullowney /* set the operation */ 836aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 837aa372e3fSPaul Mullowney 838aa372e3fSPaul Mullowney /* set the matrix */ 839aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 840aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 841aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 842aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 843aa372e3fSPaul Mullowney 844aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846aa372e3fSPaul Mullowney 847aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849aa372e3fSPaul Mullowney 850aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852aa372e3fSPaul Mullowney 853afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 854da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863afb2bd1cSJunchao Zhang #endif 864afb2bd1cSJunchao Zhang 865aa372e3fSPaul Mullowney /* perform the solve analysis */ 866aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 8701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871d49cd2b7SBarry Smith loTriFactor->solveInfo, 872d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 873d49cd2b7SBarry Smith #else 874d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 875afb2bd1cSJunchao Zhang #endif 876da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 877da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 878aa372e3fSPaul Mullowney 879da79fbbcSStefano Zampini /* assign the pointer */ 880aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 881087f3262SPaul Mullowney 882da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 88357d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 88457d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 885da79fbbcSStefano Zampini } else { 886da79fbbcSStefano Zampini /* Fill the upper 
triangular matrix */ 887da79fbbcSStefano Zampini offset = 0; 888da79fbbcSStefano Zampini for (i=0; i<n; i++) { 889da79fbbcSStefano Zampini /* set the pointers */ 890da79fbbcSStefano Zampini v = aa + ai[i]; 891da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 892da79fbbcSStefano Zampini 893da79fbbcSStefano Zampini /* first, set the diagonal elements */ 894da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 895da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 896da79fbbcSStefano Zampini 897da79fbbcSStefano Zampini offset+=1; 898da79fbbcSStefano Zampini if (nz>0) { 899da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 900da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 901da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 902da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 903da79fbbcSStefano Zampini } 904da79fbbcSStefano Zampini offset+=nz; 905da79fbbcSStefano Zampini } 906da79fbbcSStefano Zampini } 907da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 909da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 910da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 911da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 912da79fbbcSStefano Zampini } 91357d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 91457d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 915087f3262SPaul Mullowney } catch(char *ex) { 916087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 917087f3262SPaul Mullowney } 918087f3262SPaul Mullowney } 919087f3262SPaul Mullowney PetscFunctionReturn(0); 920087f3262SPaul Mullowney } 921087f3262SPaul Mullowney 922087f3262SPaul Mullowney static PetscErrorCode 
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9239ae82921SPaul Mullowney { 9249ae82921SPaul Mullowney PetscErrorCode ierr; 925087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 926087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 927087f3262SPaul Mullowney IS ip = a->row; 928087f3262SPaul Mullowney PetscBool perm_identity; 929087f3262SPaul Mullowney PetscInt n = A->rmap->n; 930087f3262SPaul Mullowney 931087f3262SPaul Mullowney PetscFunctionBegin; 932da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 933087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 934da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 935aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 936aa372e3fSPaul Mullowney 937da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 938da79fbbcSStefano Zampini 939087f3262SPaul Mullowney /* lower triangular indices */ 940087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 941087f3262SPaul Mullowney if (!perm_identity) { 9424e4bbfaaSStefano Zampini IS iip; 943da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9444e4bbfaaSStefano Zampini 9454e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9464e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 947da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 948aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 949aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 950aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9514e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9524e4bbfaaSStefano Zampini ierr = 
ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9534e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 954087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 955da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 956da79fbbcSStefano Zampini } 957087f3262SPaul Mullowney PetscFunctionReturn(0); 958087f3262SPaul Mullowney } 959087f3262SPaul Mullowney 960087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 961087f3262SPaul Mullowney { 962087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 963087f3262SPaul Mullowney IS ip = b->row; 964087f3262SPaul Mullowney PetscBool perm_identity; 965b175d8bbSPaul Mullowney PetscErrorCode ierr; 966087f3262SPaul Mullowney 967087f3262SPaul Mullowney PetscFunctionBegin; 96857181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 969087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 970ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 971087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 972087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 973087f3262SPaul Mullowney if (perm_identity) { 974087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 975087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9764e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9774e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 978087f3262SPaul Mullowney } else { 979087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 980087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9814e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9824e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 983087f3262SPaul Mullowney } 984087f3262SPaul Mullowney 985087f3262SPaul Mullowney /* get the triangular factors */ 986087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 987087f3262SPaul Mullowney PetscFunctionReturn(0); 988087f3262SPaul Mullowney } 9899ae82921SPaul Mullowney 990b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 991bda325fcSPaul Mullowney { 992bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 993aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 994aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 995da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 996da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 997bda325fcSPaul Mullowney cusparseStatus_t stat; 998aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 999aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1000aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1001aa372e3fSPaul 
Mullowney cusparseDiagType_t diagType; 10021b0a6780SStefano Zampini cudaError_t cerr; 1003da79fbbcSStefano Zampini PetscErrorCode ierr; 1004b175d8bbSPaul Mullowney 1005bda325fcSPaul Mullowney PetscFunctionBegin; 1006aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1007da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1008da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1009aa372e3fSPaul Mullowney 1010aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1011aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1012aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1013aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1014aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1015aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1016aa372e3fSPaul Mullowney 1017aa372e3fSPaul Mullowney /* Create the matrix description */ 101857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 101957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 102057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 102157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 102257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1023aa372e3fSPaul Mullowney 1024aa372e3fSPaul Mullowney /* set the operation */ 1025aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1026aa372e3fSPaul Mullowney 1027aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1028aa372e3fSPaul Mullowney 
loTriFactorT->csrMat = new CsrMatrix; 1029afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1030afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1031aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1032afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1033afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1034afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1035aa372e3fSPaul Mullowney 1036aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1037afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1039afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1040afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1041afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1042afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1043afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1044afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1046afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10471b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1048afb2bd1cSJunchao Zhang #endif 1049afb2bd1cSJunchao Zhang 1050da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 
1051aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1052aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1053aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1054aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1055aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1056aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1057afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1058afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1059afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1060d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1061afb2bd1cSJunchao Zhang #else 1062afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1063d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1064afb2bd1cSJunchao Zhang #endif 1065da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1066da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1067aa372e3fSPaul Mullowney 1068afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1069da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1070afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1072afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1074afb2bd1cSJunchao Zhang 
loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1076afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1077afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1078afb2bd1cSJunchao Zhang #endif 1079afb2bd1cSJunchao Zhang 1080afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1081aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1082afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1083afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1084d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 10851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1086d49cd2b7SBarry Smith loTriFactorT->solveInfo, 1087d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1088d49cd2b7SBarry Smith #else 1089d49cd2b7SBarry Smith loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1090afb2bd1cSJunchao Zhang #endif 1091da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1092da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1093aa372e3fSPaul Mullowney 1094da79fbbcSStefano Zampini /* assign the pointer */ 1095aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1096aa372e3fSPaul Mullowney 1097aa372e3fSPaul Mullowney /*********************************************/ 1098aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1099aa372e3fSPaul Mullowney /*********************************************/ 1100aa372e3fSPaul Mullowney 1101aa372e3fSPaul Mullowney /* allocate space for the 
transpose of the upper triangular factor */ 1102da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1103da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1104aa372e3fSPaul Mullowney 1105aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1106aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1107aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1108aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1109aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1110aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1111aa372e3fSPaul Mullowney 1112aa372e3fSPaul Mullowney /* Create the matrix description */ 111357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 111457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 111557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 111657d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 111757d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1118aa372e3fSPaul Mullowney 1119aa372e3fSPaul Mullowney /* set the operation */ 1120aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1121aa372e3fSPaul Mullowney 1122aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1123aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1124afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1125afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1126aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1127afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1128afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1129afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1130aa372e3fSPaul Mullowney 1131aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1132afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1133afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1134afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1135afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1136afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1137afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1138afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1139afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1140afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1141afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1142afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1143afb2bd1cSJunchao Zhang #endif 1144afb2bd1cSJunchao Zhang 1145da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1146aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1147aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1148aa372e3fSPaul Mullowney 
upTriFactor->csrMat->values->data().get(), 1149aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1150aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1151aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1152afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1154afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1155d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1156afb2bd1cSJunchao Zhang #else 1157afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1158d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1159afb2bd1cSJunchao Zhang #endif 1160d49cd2b7SBarry Smith 1161da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1162da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1163aa372e3fSPaul Mullowney 1164afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1165da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1166afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1168afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1172afb2bd1cSJunchao Zhang 
&upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1173afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1174afb2bd1cSJunchao Zhang #endif 1175afb2bd1cSJunchao Zhang 1176afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1177aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1178afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1179afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1180d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 11811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1182d49cd2b7SBarry Smith upTriFactorT->solveInfo, 1183d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1184d49cd2b7SBarry Smith #else 1185d49cd2b7SBarry Smith upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1186afb2bd1cSJunchao Zhang #endif 1187d49cd2b7SBarry Smith 1188da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1189da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1190aa372e3fSPaul Mullowney 1191da79fbbcSStefano Zampini /* assign the pointer */ 1192aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1193bda325fcSPaul Mullowney PetscFunctionReturn(0); 1194bda325fcSPaul Mullowney } 1195bda325fcSPaul Mullowney 1196a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1197a49f1ed0SStefano Zampini { 1198a49f1ed0SStefano Zampini __host__ __device__ 1199a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1200a49f1ed0SStefano Zampini { 1201a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1202a49f1ed0SStefano Zampini } 1203a49f1ed0SStefano Zampini }; 1204a49f1ed0SStefano Zampini 12051a2c6b5cSJunchao Zhang static 
PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1206bda325fcSPaul Mullowney { 1207aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1209bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210bda325fcSPaul Mullowney cusparseStatus_t stat; 1211aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1212b06137fdSPaul Mullowney cudaError_t err; 121385ba7357SStefano Zampini PetscErrorCode ierr; 1214b175d8bbSPaul Mullowney 1215bda325fcSPaul Mullowney PetscFunctionBegin; 12161a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1217a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1218a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1219e8d2b73aSMark Adams if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1220a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1221e8d2b73aSMark Adams if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12221a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 122385ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224ee7b52eaSHong Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1226a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1227a49f1ed0SStefano Zampini } 1228a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1229aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 123057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1231aa372e3fSPaul 
Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 123257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 123357d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1234aa372e3fSPaul Mullowney 1235b06137fdSPaul Mullowney /* set alpha and beta */ 1236afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12377656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12387656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1239afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12407656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12417656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242b06137fdSPaul Mullowney 1243aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1244aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1245a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1246554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1247554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1248aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1249a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1250aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1251aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1252a3fdcf43SKarl Rupp 1253039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 125481902715SJunchao 
Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1255afb2bd1cSJunchao Zhang 1256afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1257afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1258afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1259afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1260afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1261afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1262afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1263afb2bd1cSJunchao Zhang #endif 1264aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1265afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1266afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1267afb2bd1cSJunchao Zhang #else 1268aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 126951c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 127051c6d536SStefano Zampini /* First convert HYB to CSR */ 1271aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1272aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1273aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1274aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1275aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1276aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1277aa372e3fSPaul Mullowney 1278aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1279aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1280aa372e3fSPaul Mullowney temp->values->data().get(), 1281aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 128257d48284SJunchao 
Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1283aa372e3fSPaul Mullowney 1284aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1285aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1286aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1287aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1288aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1290aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1291aa372e3fSPaul Mullowney 1292aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1293aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1294aa372e3fSPaul Mullowney temp->values->data().get(), 1295aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1296aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1297aa372e3fSPaul Mullowney tempT->values->data().get(), 1298aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1299aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 130057d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1301aa372e3fSPaul Mullowney 1302aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1303aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 130457d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1305aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1306aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1307aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1308aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1309aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1310aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 131157d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1312aa372e3fSPaul Mullowney 1313aa372e3fSPaul Mullowney /* assign the pointer */ 1314aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13151a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1316aa372e3fSPaul Mullowney /* delete temporaries */ 1317aa372e3fSPaul Mullowney if (tempT) { 1318aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1319aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1320aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1321aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1322087f3262SPaul Mullowney } 1323aa372e3fSPaul Mullowney if (temp) { 1324aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1325aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1326aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1327aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1328aa372e3fSPaul Mullowney } 1329afb2bd1cSJunchao Zhang #endif 1330aa372e3fSPaul Mullowney } 1331a49f1ed0SStefano Zampini } 1332a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1333a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1334a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1335e8d2b73aSMark Adams if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1336e8d2b73aSMark Adams if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1337e8d2b73aSMark Adams if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1338e8d2b73aSMark Adams if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1339e8d2b73aSMark Adams if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1340e8d2b73aSMark Adams if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1341e8d2b73aSMark Adams if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1342e8d2b73aSMark Adams if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1343a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1344a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1345a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1346a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1347a49f1ed0SStefano Zampini } 1348a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1349a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1350a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1351a49f1ed0SStefano Zampini 1352a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1353a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354a49f1ed0SStefano Zampini void *csr2cscBuffer; 1355a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1356a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 
1357a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1358a49f1ed0SStefano Zampini matrix->values->data().get(), 1359a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1360a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1361a49f1ed0SStefano Zampini matrixT->values->data().get(), 1362a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1363a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1364a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1365a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1366a49f1ed0SStefano Zampini #endif 1367a49f1ed0SStefano Zampini 13681a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13691a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13701a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13711a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13721a2c6b5cSJunchao Zhang 13731a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13741a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13751a2c6b5cSJunchao Zhang */ 13761a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 13771a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 13781a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 13791a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 13801a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1381a49f1ed0SStefano Zampini matrixT->values->data().get(), 1382a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1383a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1384a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13851a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1386a49f1ed0SStefano Zampini #else 1387a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 13881a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1389a49f1ed0SStefano Zampini #endif 13901a2c6b5cSJunchao Zhang } else { 13911a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13921a2c6b5cSJunchao Zhang } 13931a2c6b5cSJunchao Zhang 1394a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1395a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1396a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1398a49f1ed0SStefano Zampini #endif 1399a49f1ed0SStefano Zampini } 1400a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1401a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1402a49f1ed0SStefano Zampini matrixT->values->begin())); 1403a49f1ed0SStefano Zampini } 1404ee7b52eaSHong Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 140585ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1406213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1407213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1408aa372e3fSPaul Mullowney /* assign the pointer */ 1409aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14101a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1411bda325fcSPaul Mullowney PetscFunctionReturn(0); 1412bda325fcSPaul Mullowney } 1413bda325fcSPaul Mullowney 1414a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14156fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1416bda325fcSPaul Mullowney { 1417c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1418465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1419465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1420465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1421465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1422bda325fcSPaul Mullowney cusparseStatus_t stat; 1423bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1424aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1425aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1426aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1427b175d8bbSPaul 
Mullowney PetscErrorCode ierr; 1428bda325fcSPaul Mullowney 1429bda325fcSPaul Mullowney PetscFunctionBegin; 1430aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1431aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1432bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1433aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1434aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1435bda325fcSPaul Mullowney } 1436bda325fcSPaul Mullowney 1437bda325fcSPaul Mullowney /* Get the GPU pointers */ 1438c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1439c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1440c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1441c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1442bda325fcSPaul Mullowney 14437a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1444aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1445a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1446c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1447c41cb2e2SAlejandro Lamas Daviña xGPU); 1448aa372e3fSPaul Mullowney 1449aa372e3fSPaul Mullowney /* First, solve U */ 1450aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1451afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1454afb2bd1cSJunchao Zhang #endif 1455afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1456aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1457aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1458aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1459aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1460d49cd2b7SBarry Smith xarray, 14611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462d49cd2b7SBarry Smith tempGPU->data().get(), 1463d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1464d49cd2b7SBarry Smith #else 1465d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1466afb2bd1cSJunchao Zhang #endif 1467aa372e3fSPaul Mullowney 1468aa372e3fSPaul Mullowney /* Then, solve L */ 1469aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1470afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1473afb2bd1cSJunchao Zhang #endif 1474afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1475aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1476aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1477aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1478aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1479d49cd2b7SBarry Smith tempGPU->data().get(), 14801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481d49cd2b7SBarry Smith xarray, 1482d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1483d49cd2b7SBarry Smith #else 1484d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1485afb2bd1cSJunchao Zhang #endif 1486aa372e3fSPaul Mullowney 1487aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1488a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1489c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1490aa372e3fSPaul Mullowney tempGPU->begin()); 1491aa372e3fSPaul Mullowney 1492aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1493a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1494bda325fcSPaul Mullowney 1495bda325fcSPaul Mullowney /* restore */ 1496c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1497c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1498661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1499958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1500bda325fcSPaul Mullowney PetscFunctionReturn(0); 1501bda325fcSPaul Mullowney } 1502bda325fcSPaul Mullowney 15036fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1504bda325fcSPaul Mullowney { 1505465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1506465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1507bda325fcSPaul Mullowney cusparseStatus_t stat; 1508bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1509aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1510aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1511aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1512b175d8bbSPaul Mullowney 
PetscErrorCode ierr; 1513bda325fcSPaul Mullowney 1514bda325fcSPaul Mullowney PetscFunctionBegin; 1515aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1516aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1517bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1518aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520bda325fcSPaul Mullowney } 1521bda325fcSPaul Mullowney 1522bda325fcSPaul Mullowney /* Get the GPU pointers */ 1523c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525bda325fcSPaul Mullowney 15267a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1527aa372e3fSPaul Mullowney /* First, solve U */ 1528aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1529afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1532afb2bd1cSJunchao Zhang #endif 1533afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1534aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1535aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1536aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1537aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1538d49cd2b7SBarry Smith barray, 15391b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540d49cd2b7SBarry Smith tempGPU->data().get(), 1541d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1542d49cd2b7SBarry Smith #else 
1543d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1544afb2bd1cSJunchao Zhang #endif 1545aa372e3fSPaul Mullowney 1546aa372e3fSPaul Mullowney /* Then, solve L */ 1547aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1548afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15491b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1551afb2bd1cSJunchao Zhang #endif 1552afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1553aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1554aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1555aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1556aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1557d49cd2b7SBarry Smith tempGPU->data().get(), 15581b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559d49cd2b7SBarry Smith xarray, 1560d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1561d49cd2b7SBarry Smith #else 1562d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1563afb2bd1cSJunchao Zhang #endif 1564bda325fcSPaul Mullowney 1565bda325fcSPaul Mullowney /* restore */ 1566c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1567c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1568661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1570bda325fcSPaul Mullowney PetscFunctionReturn(0); 1571bda325fcSPaul Mullowney } 1572bda325fcSPaul Mullowney 15736fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15749ae82921SPaul Mullowney { 1575465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1576465f34aeSAlejandro Lamas Daviña PetscScalar 
*xarray; 1577465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1578465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15799ae82921SPaul Mullowney cusparseStatus_t stat; 15809ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1581aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1582aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1583aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1584b175d8bbSPaul Mullowney PetscErrorCode ierr; 15859ae82921SPaul Mullowney 15869ae82921SPaul Mullowney PetscFunctionBegin; 1587ebc8f436SDominic Meiser 1588e057df02SPaul Mullowney /* Get the GPU pointers */ 1589c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1590c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1591c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1592c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15939ae82921SPaul Mullowney 15947a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1595aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1596a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1597c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15984e4bbfaaSStefano Zampini tempGPU->begin()); 1599aa372e3fSPaul Mullowney 1600aa372e3fSPaul Mullowney /* Next, solve L */ 1601aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1602afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->num_rows, 16031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1604afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1605afb2bd1cSJunchao Zhang #endif 1606afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1607aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1608aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1609aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1610aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1611d49cd2b7SBarry Smith tempGPU->data().get(), 16121b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613d49cd2b7SBarry Smith xarray, 1614d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1615d49cd2b7SBarry Smith #else 1616d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1617afb2bd1cSJunchao Zhang #endif 1618aa372e3fSPaul Mullowney 1619aa372e3fSPaul Mullowney /* Then, solve U */ 1620aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1621afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16221b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1624afb2bd1cSJunchao Zhang #endif 1625afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1626aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1627aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1628aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1629d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 16301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1631d49cd2b7SBarry Smith tempGPU->data().get(), 1632d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1633d49cd2b7SBarry Smith #else 1634d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1635afb2bd1cSJunchao Zhang #endif 
1636d49cd2b7SBarry Smith 16374e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1638a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 16394e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16404e4bbfaaSStefano Zampini xGPU); 16419ae82921SPaul Mullowney 1642c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1643c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1644661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1645958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16469ae82921SPaul Mullowney PetscFunctionReturn(0); 16479ae82921SPaul Mullowney } 16489ae82921SPaul Mullowney 16496fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16509ae82921SPaul Mullowney { 1651465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1652465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16539ae82921SPaul Mullowney cusparseStatus_t stat; 16549ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1655aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1656aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1657aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1658b175d8bbSPaul Mullowney PetscErrorCode ierr; 16599ae82921SPaul Mullowney 16609ae82921SPaul Mullowney PetscFunctionBegin; 1661e057df02SPaul Mullowney /* Get the GPU pointers */ 1662c41cb2e2SAlejandro Lamas Daviña ierr = 
VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1663c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 16649ae82921SPaul Mullowney 16657a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1666aa372e3fSPaul Mullowney /* First, solve L */ 1667aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1668afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16691b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1670afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1671afb2bd1cSJunchao Zhang #endif 1672afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1673aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1674aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1675aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1676aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1677d49cd2b7SBarry Smith barray, 16781b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1679d49cd2b7SBarry Smith tempGPU->data().get(), 1680d49cd2b7SBarry Smith loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1681d49cd2b7SBarry Smith #else 1682d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1683afb2bd1cSJunchao Zhang #endif 1684d49cd2b7SBarry Smith 1685aa372e3fSPaul Mullowney /* Next, solve U */ 1686aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1687afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16881b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1689afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1690afb2bd1cSJunchao Zhang #endif 1691afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1692aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1693aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1694aa372e3fSPaul Mullowney 
upTriFactor->csrMat->column_indices->data().get(), 1695aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1696d49cd2b7SBarry Smith tempGPU->data().get(), 16971b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1698d49cd2b7SBarry Smith xarray, 1699d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1700d49cd2b7SBarry Smith #else 1701d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1702afb2bd1cSJunchao Zhang #endif 17039ae82921SPaul Mullowney 1704c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1705c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1706661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1707958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17089ae82921SPaul Mullowney PetscFunctionReturn(0); 17099ae82921SPaul Mullowney } 17109ae82921SPaul Mullowney 17117e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17127e8381f9SStefano Zampini { 17137e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17147e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17157e8381f9SStefano Zampini cudaError_t cerr; 17167e8381f9SStefano Zampini PetscErrorCode ierr; 17177e8381f9SStefano Zampini 17187e8381f9SStefano Zampini PetscFunctionBegin; 17197e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17207e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17217e8381f9SStefano Zampini 17227e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17237e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 17247e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 17257e8381f9SStefano Zampini ierr = 
PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 17267e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17277e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17287e8381f9SStefano Zampini } 17297e8381f9SStefano Zampini PetscFunctionReturn(0); 17307e8381f9SStefano Zampini } 17317e8381f9SStefano Zampini 17327e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17337e8381f9SStefano Zampini { 17347e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17357e8381f9SStefano Zampini PetscErrorCode ierr; 17367e8381f9SStefano Zampini 17377e8381f9SStefano Zampini PetscFunctionBegin; 17387e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 17397e8381f9SStefano Zampini *array = a->a; 17407e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 17417e8381f9SStefano Zampini PetscFunctionReturn(0); 17427e8381f9SStefano Zampini } 17437e8381f9SStefano Zampini 1744042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 17459ae82921SPaul Mullowney { 1746aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 17477c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 17489ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1749213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 17509ae82921SPaul Mullowney PetscErrorCode ierr; 1751aa372e3fSPaul Mullowney cusparseStatus_t stat; 1752abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1753b06137fdSPaul Mullowney cudaError_t err; 17549ae82921SPaul Mullowney 17559ae82921SPaul Mullowney PetscFunctionBegin; 1756e8d2b73aSMark Adams if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1757c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1758a49f1ed0SStefano Zampini if (A->nonzerostate == 
cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1759a49f1ed0SStefano Zampini CsrMatrix *matrix; 1760afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 176185ba7357SStefano Zampini 1762e8d2b73aSMark Adams if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 176385ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1764afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 176505035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 17664863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 176785ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1768a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 176934d6c7a5SJose E. Roman } else { 1770abb89eb1SStefano Zampini PetscInt nnz; 177185ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17727c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1773a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 17747c700b8dSJunchao Zhang delete cusparsestruct->workVector; 177581902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1776a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1777a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 17789ae82921SPaul Mullowney try { 17799ae82921SPaul Mullowney if (a->compressedrow.use) { 17809ae82921SPaul Mullowney m = a->compressedrow.nrows; 17819ae82921SPaul Mullowney ii = a->compressedrow.i; 17829ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 17839ae82921SPaul Mullowney } else { 1784213423ffSJunchao Zhang m = A->rmap->n; 1785213423ffSJunchao Zhang ii = a->i; 1786e6e9a74fSStefano Zampini ridx = NULL; 17879ae82921SPaul 
Mullowney } 1788e8d2b73aSMark Adams if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1789e8d2b73aSMark Adams if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1790abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1791abb89eb1SStefano Zampini else nnz = a->nz; 17929ae82921SPaul Mullowney 179385ba7357SStefano Zampini /* create cusparse matrix */ 1794abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1795aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 179657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 179757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 179857d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 17999ae82921SPaul Mullowney 1800afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 18017656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 18027656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1803afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 18047656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 18057656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 180657d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1807b06137fdSPaul Mullowney 1808aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 
1809aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1810aa372e3fSPaul Mullowney /* set the matrix */ 1811afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1812afb2bd1cSJunchao Zhang mat->num_rows = m; 1813afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1814abb89eb1SStefano Zampini mat->num_entries = nnz; 1815afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1816afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18179ae82921SPaul Mullowney 1818abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1819abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1820aa372e3fSPaul Mullowney 1821abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1822abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1823aa372e3fSPaul Mullowney 1824aa372e3fSPaul Mullowney /* assign the pointer */ 1825afb2bd1cSJunchao Zhang matstruct->mat = mat; 1826afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1827afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1828afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1829afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1830afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1831afb2bd1cSJunchao Zhang mat->values->data().get(), 1832afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1833afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1834afb2bd1cSJunchao Zhang } 1835afb2bd1cSJunchao Zhang #endif 1836aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1837afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1838afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1839afb2bd1cSJunchao Zhang #else 1840afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1841afb2bd1cSJunchao Zhang mat->num_rows = m; 1842afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1843abb89eb1SStefano Zampini mat->num_entries = nnz; 1844afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1845afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1846aa372e3fSPaul Mullowney 1847abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1848abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1849aa372e3fSPaul Mullowney 1850abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1851abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1852aa372e3fSPaul Mullowney 1853aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 185457d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1855aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1856aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1857afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1858afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1859afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1860afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 186157d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1862aa372e3fSPaul Mullowney /* assign the pointer */ 1863aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1864aa372e3fSPaul Mullowney 1865afb2bd1cSJunchao Zhang if (mat) { 1866afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1867afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1868afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1869afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1870087f3262SPaul Mullowney } 1871afb2bd1cSJunchao Zhang #endif 1872087f3262SPaul Mullowney } 1873ca45077fSPaul Mullowney 1874aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1875213423ffSJunchao Zhang if (a->compressedrow.use) { 1876213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1877aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1878aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1879213423ffSJunchao Zhang tmp = m; 1880213423ffSJunchao Zhang } else { 1881213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1882213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1883213423ffSJunchao Zhang tmp = 0; 1884213423ffSJunchao Zhang } 1885213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1886aa372e3fSPaul Mullowney 1887aa372e3fSPaul Mullowney /* assign the pointer */ 1888aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 18899ae82921SPaul Mullowney } 
catch(char *ex) { 18909ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18919ae82921SPaul Mullowney } 189205035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 189385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 189434d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 189534d6c7a5SJose E. Roman } 1896abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 18979ae82921SPaul Mullowney } 18989ae82921SPaul Mullowney PetscFunctionReturn(0); 18999ae82921SPaul Mullowney } 19009ae82921SPaul Mullowney 1901c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1902aa372e3fSPaul Mullowney { 1903aa372e3fSPaul Mullowney template <typename Tuple> 1904aa372e3fSPaul Mullowney __host__ __device__ 1905aa372e3fSPaul Mullowney void operator()(Tuple t) 1906aa372e3fSPaul Mullowney { 1907aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1908aa372e3fSPaul Mullowney } 1909aa372e3fSPaul Mullowney }; 1910aa372e3fSPaul Mullowney 19117e8381f9SStefano Zampini struct VecCUDAEquals 19127e8381f9SStefano Zampini { 19137e8381f9SStefano Zampini template <typename Tuple> 19147e8381f9SStefano Zampini __host__ __device__ 19157e8381f9SStefano Zampini void operator()(Tuple t) 19167e8381f9SStefano Zampini { 19177e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19187e8381f9SStefano Zampini } 19197e8381f9SStefano Zampini }; 19207e8381f9SStefano Zampini 1921e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1922e6e9a74fSStefano Zampini { 1923e6e9a74fSStefano Zampini template <typename Tuple> 1924e6e9a74fSStefano Zampini __host__ __device__ 1925e6e9a74fSStefano Zampini void operator()(Tuple t) 1926e6e9a74fSStefano Zampini { 1927e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1928e6e9a74fSStefano Zampini } 1929e6e9a74fSStefano Zampini }; 1930e6e9a74fSStefano Zampini 1931afb2bd1cSJunchao Zhang struct MatMatCusparse { 
1932ccdfe979SStefano Zampini PetscBool cisdense; 1933ccdfe979SStefano Zampini PetscScalar *Bt; 1934ccdfe979SStefano Zampini Mat X; 1935fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1936fcdce8c4SStefano Zampini PetscLogDouble flops; 1937fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 1938*b4285af6SJunchao Zhang 1939afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1940fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 1941afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1942afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1943afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1944afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1945*b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1946*b4285af6SJunchao Zhang void *dBuffer4; 1947*b4285af6SJunchao Zhang void *dBuffer5; 1948*b4285af6SJunchao Zhang #endif 1949fcdce8c4SStefano Zampini size_t mmBufferSize; 1950fcdce8c4SStefano Zampini void *mmBuffer; 1951fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1952fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 1953afb2bd1cSJunchao Zhang #endif 1954afb2bd1cSJunchao Zhang }; 1955ccdfe979SStefano Zampini 1956ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1957ccdfe979SStefano Zampini { 1958ccdfe979SStefano Zampini PetscErrorCode ierr; 1959ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1960ccdfe979SStefano Zampini cudaError_t cerr; 1961fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1962fcdce8c4SStefano Zampini cusparseStatus_t stat; 1963fcdce8c4SStefano Zampini #endif 1964ccdfe979SStefano Zampini 1965ccdfe979SStefano Zampini PetscFunctionBegin; 1966ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1967fcdce8c4SStefano Zampini 
delete mmdata->Bcsr; 1968afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1969fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1970afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1971afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1972fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1973*b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1974*b4285af6SJunchao Zhang if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 1975*b4285af6SJunchao Zhang if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 1976*b4285af6SJunchao Zhang #endif 1977*b4285af6SJunchao Zhang if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1978*b4285af6SJunchao Zhang if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1979afb2bd1cSJunchao Zhang #endif 1980ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1981ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1982ccdfe979SStefano Zampini PetscFunctionReturn(0); 1983ccdfe979SStefano Zampini } 1984ccdfe979SStefano Zampini 1985ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1986ccdfe979SStefano Zampini 1987ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1988ccdfe979SStefano Zampini { 1989ccdfe979SStefano Zampini Mat_Product *product = C->product; 1990ccdfe979SStefano Zampini Mat A,B; 1991afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1992ccdfe979SStefano Zampini PetscBool flg,biscuda; 1993ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1994ccdfe979SStefano 
Zampini cusparseStatus_t stat; 1995ccdfe979SStefano Zampini cusparseOperation_t opA; 1996ccdfe979SStefano Zampini const PetscScalar *barray; 1997ccdfe979SStefano Zampini PetscScalar *carray; 1998ccdfe979SStefano Zampini PetscErrorCode ierr; 1999ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2000ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2001ccdfe979SStefano Zampini CsrMatrix *csrmat; 2002ccdfe979SStefano Zampini 2003ccdfe979SStefano Zampini PetscFunctionBegin; 2004ccdfe979SStefano Zampini MatCheckProduct(C,1); 2005e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2006ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2007ccdfe979SStefano Zampini A = product->A; 2008ccdfe979SStefano Zampini B = product->B; 2009ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2010e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2011ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2012ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 2013ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2014ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2015ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2016ccdfe979SStefano Zampini switch (product->type) { 2017ccdfe979SStefano Zampini case MATPRODUCT_AB: 2018ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2019ccdfe979SStefano Zampini mat = cusp->mat; 2020ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2021ccdfe979SStefano Zampini m = A->rmap->n; 2022ccdfe979SStefano Zampini n = B->cmap->n; 
2023ccdfe979SStefano Zampini break; 2024ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20251a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2026e6e9a74fSStefano Zampini mat = cusp->mat; 2027e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2028e6e9a74fSStefano Zampini } else { 20291a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2030ccdfe979SStefano Zampini mat = cusp->matTranspose; 2031ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2032e6e9a74fSStefano Zampini } 2033ccdfe979SStefano Zampini m = A->cmap->n; 2034ccdfe979SStefano Zampini n = B->cmap->n; 2035ccdfe979SStefano Zampini break; 2036ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2037ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2038ccdfe979SStefano Zampini mat = cusp->mat; 2039ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2040ccdfe979SStefano Zampini m = A->rmap->n; 2041ccdfe979SStefano Zampini n = B->rmap->n; 2042ccdfe979SStefano Zampini break; 2043ccdfe979SStefano Zampini default: 2044e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2045ccdfe979SStefano Zampini } 2046e8d2b73aSMark Adams if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2047ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2048ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2049ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2050afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2051ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2052afb2bd1cSJunchao Zhang 2053ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2054c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt 
|| product->type == MATPRODUCT_PtAP) { 2055c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2056c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2057c8378d12SStefano Zampini } else { 2058c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2059c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2060c8378d12SStefano Zampini } 2061c8378d12SStefano Zampini 2062c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2063afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2064afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2065fcdce8c4SStefano Zampini /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2066afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2067fcdce8c4SStefano Zampini size_t mmBufferSize; 2068afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2069afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2070afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2071afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2072afb2bd1cSJunchao Zhang } 2073c8378d12SStefano Zampini 2074afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2075afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2076afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 
2077afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2078afb2bd1cSJunchao Zhang } 2079afb2bd1cSJunchao Zhang 2080afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2081afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2082afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2083afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2084afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2085afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2086afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2087afb2bd1cSJunchao Zhang } 2088afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2089afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2090afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2091fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2092fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2093ee7b52eaSHong Zhang cudaError_t cerr; 2094fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2095fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2096fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2097fcdce8c4SStefano Zampini } 2098afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2099afb2bd1cSJunchao Zhang } else { 2100afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2101afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2102afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2103afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 
2104afb2bd1cSJunchao Zhang } 2105afb2bd1cSJunchao Zhang 2106afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2107afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2108afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2109afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2110fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2111afb2bd1cSJunchao Zhang #else 2112afb2bd1cSJunchao Zhang PetscInt k; 2113afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2114ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2115ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2116ccdfe979SStefano Zampini cublasStatus_t cerr; 2117ccdfe979SStefano Zampini 2118ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2119ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2120ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2121ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2122ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2123ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2124ccdfe979SStefano Zampini blda = B->cmap->n; 2125afb2bd1cSJunchao Zhang k = B->cmap->n; 2126afb2bd1cSJunchao Zhang } else { 2127afb2bd1cSJunchao Zhang k = B->rmap->n; 2128ccdfe979SStefano Zampini } 2129ccdfe979SStefano Zampini 2130afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2131ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2132afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2133ccdfe979SStefano Zampini csrmat->values->data().get(), 2134ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2135ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2136ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2137ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2138afb2bd1cSJunchao Zhang #endif 2139c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2140c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2141ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2142ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2143ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2144ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2145ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2146ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2147ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2148ccdfe979SStefano Zampini } else { 2149ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2150ccdfe979SStefano Zampini } 2151ccdfe979SStefano Zampini if (mmdata->cisdense) { 2152ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2153ccdfe979SStefano Zampini } 2154ccdfe979SStefano Zampini if (!biscuda) { 2155ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2156ccdfe979SStefano Zampini } 2157ccdfe979SStefano Zampini PetscFunctionReturn(0); 2158ccdfe979SStefano Zampini } 2159ccdfe979SStefano Zampini 2160ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2161ccdfe979SStefano Zampini { 2162ccdfe979SStefano Zampini Mat_Product *product = C->product; 2163ccdfe979SStefano Zampini Mat A,B; 2164ccdfe979SStefano Zampini PetscInt m,n; 2165ccdfe979SStefano Zampini PetscBool cisdense,flg; 
2166ccdfe979SStefano Zampini PetscErrorCode ierr; 2167ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2168ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2169ccdfe979SStefano Zampini 2170ccdfe979SStefano Zampini PetscFunctionBegin; 2171ccdfe979SStefano Zampini MatCheckProduct(C,1); 2172e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2173ccdfe979SStefano Zampini A = product->A; 2174ccdfe979SStefano Zampini B = product->B; 2175ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2176e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2177ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2178e8d2b73aSMark Adams if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2179ccdfe979SStefano Zampini switch (product->type) { 2180ccdfe979SStefano Zampini case MATPRODUCT_AB: 2181ccdfe979SStefano Zampini m = A->rmap->n; 2182ccdfe979SStefano Zampini n = B->cmap->n; 2183ccdfe979SStefano Zampini break; 2184ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2185ccdfe979SStefano Zampini m = A->cmap->n; 2186ccdfe979SStefano Zampini n = B->cmap->n; 2187ccdfe979SStefano Zampini break; 2188ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2189ccdfe979SStefano Zampini m = A->rmap->n; 2190ccdfe979SStefano Zampini n = B->rmap->n; 2191ccdfe979SStefano Zampini break; 2192ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2193ccdfe979SStefano Zampini m = B->cmap->n; 2194ccdfe979SStefano Zampini n = B->cmap->n; 2195ccdfe979SStefano Zampini break; 2196ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2197ccdfe979SStefano Zampini m = B->rmap->n; 2198ccdfe979SStefano Zampini n = B->rmap->n; 2199ccdfe979SStefano Zampini break; 2200ccdfe979SStefano Zampini default: 2201e8d2b73aSMark Adams 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2202ccdfe979SStefano Zampini } 2203ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2204ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2205ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2206ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2207ccdfe979SStefano Zampini 2208ccdfe979SStefano Zampini /* product data */ 2209ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2210ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2211afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2212afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2213ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2214afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2215ccdfe979SStefano Zampini } 2216afb2bd1cSJunchao Zhang #endif 2217ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2218ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2219ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2220ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2221ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2222ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2223ccdfe979SStefano Zampini } else { 2224ccdfe979SStefano Zampini ierr 
= MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2225ccdfe979SStefano Zampini } 2226ccdfe979SStefano Zampini } 2227ccdfe979SStefano Zampini C->product->data = mmdata; 2228ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2229ccdfe979SStefano Zampini 2230ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2231ccdfe979SStefano Zampini PetscFunctionReturn(0); 2232ccdfe979SStefano Zampini } 2233ccdfe979SStefano Zampini 2234fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2235ccdfe979SStefano Zampini { 2236ccdfe979SStefano Zampini Mat_Product *product = C->product; 2237fcdce8c4SStefano Zampini Mat A,B; 2238fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2239fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2240fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2241fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2242fcdce8c4SStefano Zampini PetscBool flg; 2243ccdfe979SStefano Zampini PetscErrorCode ierr; 2244fcdce8c4SStefano Zampini cusparseStatus_t stat; 2245fcdce8c4SStefano Zampini cudaError_t cerr; 2246fcdce8c4SStefano Zampini MatProductType ptype; 2247fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2248fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2249fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2250fcdce8c4SStefano Zampini #endif 2251*b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2252ccdfe979SStefano Zampini 2253ccdfe979SStefano Zampini PetscFunctionBegin; 2254ccdfe979SStefano Zampini MatCheckProduct(C,1); 2255e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2256fcdce8c4SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2257e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2258fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2259fcdce8c4SStefano Zampini A = product->A; 2260fcdce8c4SStefano Zampini B = product->B; 2261fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2262fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2263fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2264e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2265fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2266e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2267fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2268e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2269fcdce8c4SStefano Zampini goto finalize; 2270fcdce8c4SStefano Zampini } 2271fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2272fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2273e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2274fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2275e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2276fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind 
to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2277fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2278fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2279fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2280fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2281e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2282e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2283e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2284fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2285fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2286fcdce8c4SStefano Zampini 2287fcdce8c4SStefano Zampini ptype = product->type; 2288fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2289fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2290fcdce8c4SStefano Zampini switch (ptype) { 2291fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2292fcdce8c4SStefano Zampini Amat = Acusp->mat; 2293fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2294fcdce8c4SStefano Zampini break; 2295fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2296fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2297fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2298fcdce8c4SStefano Zampini break; 2299fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2300fcdce8c4SStefano Zampini Amat = Acusp->mat; 2301fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2302fcdce8c4SStefano Zampini 
break; 2303fcdce8c4SStefano Zampini default: 2304e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2305fcdce8c4SStefano Zampini } 2306fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2307e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2308e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2309e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2310fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2311fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2312fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2313e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2314e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2315e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2316fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2317fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2318fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2319*b4285af6SJunchao Zhang stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2320*b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2321*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2322*b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2323*b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2324*b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2325*b4285af6SJunchao Zhang #else 2326*b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2327fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2328fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2329fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2330*b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2331fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2332fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2333*b4285af6SJunchao Zhang #endif 2334fcdce8c4SStefano Zampini #else 2335*b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2336fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2337fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2338fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2339fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), 
Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2340fcdce8c4SStefano Zampini #endif 2341fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2342fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2343fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2344fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2345fcdce8c4SStefano Zampini finalize: 2346fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 2347fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2348fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2349fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2350fcdce8c4SStefano Zampini c->reallocs = 0; 2351fcdce8c4SStefano Zampini C->info.mallocs += 0; 2352fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2353fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2354fcdce8c4SStefano Zampini C->num_ass++; 2355ccdfe979SStefano Zampini PetscFunctionReturn(0); 2356ccdfe979SStefano Zampini } 2357fcdce8c4SStefano Zampini 2358fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2359fcdce8c4SStefano Zampini { 2360fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2361fcdce8c4SStefano Zampini Mat A,B; 2362fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2363fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2364fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2365fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2366fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2367fcdce8c4SStefano Zampini PetscBool flg; 2368fcdce8c4SStefano Zampini PetscErrorCode ierr; 2369fcdce8c4SStefano Zampini cusparseStatus_t stat; 2370fcdce8c4SStefano Zampini cudaError_t cerr; 
2371fcdce8c4SStefano Zampini MatProductType ptype; 2372fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2373fcdce8c4SStefano Zampini PetscLogDouble flops; 2374fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2375fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2376fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2377fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2378fcdce8c4SStefano Zampini #else 2379fcdce8c4SStefano Zampini int cnz; 2380fcdce8c4SStefano Zampini #endif 2381*b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2382fcdce8c4SStefano Zampini 2383fcdce8c4SStefano Zampini PetscFunctionBegin; 2384fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2385e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2386fcdce8c4SStefano Zampini A = product->A; 2387fcdce8c4SStefano Zampini B = product->B; 2388fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2389e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2390fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2391e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2392fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2393fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2394fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2395fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2396e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2397e8d2b73aSMark Adams if 
(Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2398fcdce8c4SStefano Zampini 2399fcdce8c4SStefano Zampini /* product data */ 2400fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2401fcdce8c4SStefano Zampini C->product->data = mmdata; 2402fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2403fcdce8c4SStefano Zampini 2404fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2405fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2406fcdce8c4SStefano Zampini ptype = product->type; 2407fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2408fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2409fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2410fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2411fcdce8c4SStefano Zampini switch (ptype) { 2412fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2413fcdce8c4SStefano Zampini m = A->rmap->n; 2414fcdce8c4SStefano Zampini n = B->cmap->n; 2415fcdce8c4SStefano Zampini k = A->cmap->n; 2416fcdce8c4SStefano Zampini Amat = Acusp->mat; 2417fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2418fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2419fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2420fcdce8c4SStefano Zampini break; 2421fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2422fcdce8c4SStefano Zampini m = A->cmap->n; 2423fcdce8c4SStefano Zampini n = B->cmap->n; 2424fcdce8c4SStefano Zampini k = A->rmap->n; 24251a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2426fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2427fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2428fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2429fcdce8c4SStefano Zampini break; 
2430fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2431fcdce8c4SStefano Zampini m = A->rmap->n; 2432fcdce8c4SStefano Zampini n = B->rmap->n; 2433fcdce8c4SStefano Zampini k = A->cmap->n; 24341a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2435fcdce8c4SStefano Zampini Amat = Acusp->mat; 2436fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2437fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2438fcdce8c4SStefano Zampini break; 2439fcdce8c4SStefano Zampini default: 2440e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2441fcdce8c4SStefano Zampini } 2442fcdce8c4SStefano Zampini 2443fcdce8c4SStefano Zampini /* create cusparse matrix */ 2444fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2445fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2446fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2447fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2448fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2449fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2450fcdce8c4SStefano Zampini 2451fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2452fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2453fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2454fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2455fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2456fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2457fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 
2458fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2459fcdce8c4SStefano Zampini } else { 2460fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2461fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2462fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2463fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2464fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2465fcdce8c4SStefano Zampini } 2466fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2467fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2468fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2469fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2470fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2471fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2472fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2473fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2474fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2475fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2476fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2477fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2478fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2479fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2480fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 
2481fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2482fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2483fcdce8c4SStefano Zampini c->nz = 0; 2484fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2485fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2486fcdce8c4SStefano Zampini goto finalizesym; 2487fcdce8c4SStefano Zampini } 2488fcdce8c4SStefano Zampini 2489e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2490e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2491fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2492fcdce8c4SStefano Zampini if (!biscompressed) { 2493fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2494fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2495fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2496fcdce8c4SStefano Zampini #endif 2497fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2498fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2499fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2500fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2501fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2502fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2503fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2504fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2505fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2506fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2507fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + 
B->rmap->n + 1); 2508fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2509fcdce8c4SStefano Zampini } 2510fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2511fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2512fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2513fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2514fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2515fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2516fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2517fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2518fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2519fcdce8c4SStefano Zampini } 2520fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2521fcdce8c4SStefano Zampini #endif 2522fcdce8c4SStefano Zampini } 2523e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2524e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2525fcdce8c4SStefano Zampini /* precompute flops count */ 2526fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2527fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2528fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2529fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2530fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2531fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2532fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2533fcdce8c4SStefano Zampini } 2534fcdce8c4SStefano Zampini } 2535fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2536fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2537fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - 
a->i[i]; 2538fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2539fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 2540fcdce8c4SStefano Zampini } 2541fcdce8c4SStefano Zampini } else { /* TODO */ 2542fcdce8c4SStefano Zampini flops = 0.; 2543fcdce8c4SStefano Zampini } 2544fcdce8c4SStefano Zampini 2545fcdce8c4SStefano Zampini mmdata->flops = flops; 2546fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2547*b4285af6SJunchao Zhang 2548fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2549fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2550fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2551fcdce8c4SStefano Zampini NULL, NULL, NULL, 2552fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2553fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2554fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2555*b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2556*b4285af6SJunchao Zhang { 2557*b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2558*b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2559*b4285af6SJunchao Zhang */ 2560*b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2561*b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2562*b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2563*b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2564*b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2565*b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2566*b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2567*b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2568*b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2569*b4285af6SJunchao Zhang 2570*b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2571*b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2572*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2573*b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2574*b4285af6SJunchao Zhang &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2575*b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2576*b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2577*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2578*b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2579*b4285af6SJunchao Zhang &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2580*b4285af6SJunchao Zhang 2581*b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2582*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2583*b4285af6SJunchao Zhang 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2584*b4285af6SJunchao Zhang &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2585*b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2586*b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2587*b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2588*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2589*b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2590*b4285af6SJunchao Zhang &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2591*b4285af6SJunchao Zhang cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2592*b4285af6SJunchao Zhang cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2593*b4285af6SJunchao Zhang 2594*b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2595*b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 2596*b4285af6SJunchao Zhang stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2597*b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2598*b4285af6SJunchao Zhang /* allocate matrix C */ 2599*b4285af6SJunchao Zhang Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2600*b4285af6SJunchao Zhang Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2601*b4285af6SJunchao Zhang /* update matC with the new pointers */ 2602*b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2603*b4285af6SJunchao Zhang Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2604*b4285af6SJunchao Zhang 2605*b4285af6SJunchao Zhang 
/*----------------------------------------------------------------------*/ 2606*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2607*b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2608*b4285af6SJunchao Zhang &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2609*b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2610*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2611*b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2612*b4285af6SJunchao Zhang &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2613*b4285af6SJunchao Zhang cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2614*b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2615*b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2616*b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2617*b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2618*b4285af6SJunchao Zhang ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2619*b4285af6SJunchao Zhang } 2620*b4285af6SJunchao Zhang #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2621*b4285af6SJunchao Zhang size_t bufSize2; 2622fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2623*b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2624fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2625fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2626fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2627bfcc3627SStefano Zampini 
cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2628fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2629*b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2630fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2631fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2632fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2633fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2634*b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2635fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2636fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2637fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2638fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2639fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2640fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2641fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2642fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2643bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2644fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2645*b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2646fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2647fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2648fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2649fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2650fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2651fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 265200702c57SStefano Zampini ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2653fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2654fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2655fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2656fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2657fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2658fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2659*b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2660fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2661fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2662*b4285af6SJunchao Zhang #endif 
2663fcdce8c4SStefano Zampini #else 2664fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2665*b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2666fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2667fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2668fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2669fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2670fcdce8c4SStefano Zampini c->nz = cnz; 2671fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2672fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2673fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2674fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2675fcdce8c4SStefano Zampini 2676fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2677fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2678fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2679fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2680*b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2681fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2682fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2683fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2684fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2685fcdce8c4SStefano Zampini #endif 2686fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2687fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2688fcdce8c4SStefano Zampini finalizesym: 2689fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2690fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2691fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2692fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2693fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2694fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2695fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2696fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2697fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2698fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2699fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2700fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2701fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2702fcdce8c4SStefano Zampini cerr = 
cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2703fcdce8c4SStefano Zampini } else { 2704fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2705fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2706fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2707fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2708fcdce8c4SStefano Zampini } 2709fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2710fcdce8c4SStefano Zampini PetscInt r = 0; 2711fcdce8c4SStefano Zampini c->i[0] = 0; 2712fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2713fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2714fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2715fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2716fcdce8c4SStefano Zampini } 2717fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2718fcdce8c4SStefano Zampini } 2719fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2720fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2721fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2722fcdce8c4SStefano Zampini c->maxnz = c->nz; 2723fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2724fcdce8c4SStefano Zampini c->rmax = 0; 2725fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2726fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2727fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2728fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 
2729fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2730fcdce8c4SStefano Zampini } 2731fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2732fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2733fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2734fcdce8c4SStefano Zampini 2735fcdce8c4SStefano Zampini C->nonzerostate++; 2736fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2737fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2738fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2739fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2740fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2741fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2742fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2743abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2744fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2745fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2746fcdce8c4SStefano Zampini } 2747fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2748fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2749fcdce8c4SStefano Zampini } 2750fcdce8c4SStefano Zampini 2751fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2752fcdce8c4SStefano Zampini 2753fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2754fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2755fcdce8c4SStefano Zampini { 2756fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2757fcdce8c4SStefano Zampini PetscErrorCode ierr; 2758fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 
2759fcdce8c4SStefano Zampini 2760fcdce8c4SStefano Zampini PetscFunctionBegin; 2761fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2762fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2763abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 2764fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2765fcdce8c4SStefano Zampini } 2766fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2767fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2768fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2769fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2770fcdce8c4SStefano Zampini } 2771fcdce8c4SStefano Zampini } 2772fcdce8c4SStefano Zampini if (isdense) { 2773ccdfe979SStefano Zampini switch (product->type) { 2774ccdfe979SStefano Zampini case MATPRODUCT_AB: 2775ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2776ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2777ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2778ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2779fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2780fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2781fcdce8c4SStefano Zampini } else { 2782fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2783fcdce8c4SStefano Zampini } 2784fcdce8c4SStefano Zampini break; 2785fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2786fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2787fcdce8c4SStefano Zampini break; 2788ccdfe979SStefano Zampini default: 2789ccdfe979SStefano Zampini break; 2790ccdfe979SStefano Zampini } 2791fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2792fcdce8c4SStefano Zampini switch (product->type) { 2793fcdce8c4SStefano 
Zampini case MATPRODUCT_AB: 2794fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2795fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2796fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2797fcdce8c4SStefano Zampini break; 2798fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2799fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2800fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2801fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2802fcdce8c4SStefano Zampini break; 2803fcdce8c4SStefano Zampini default: 2804fcdce8c4SStefano Zampini break; 2805fcdce8c4SStefano Zampini } 2806fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 2807fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2808fcdce8c4SStefano Zampini } 2809ccdfe979SStefano Zampini PetscFunctionReturn(0); 2810ccdfe979SStefano Zampini } 2811ccdfe979SStefano Zampini 28126fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 28139ae82921SPaul Mullowney { 2814b175d8bbSPaul Mullowney PetscErrorCode ierr; 28159ae82921SPaul Mullowney 28169ae82921SPaul Mullowney PetscFunctionBegin; 2817e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2818e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2819e6e9a74fSStefano Zampini } 2820e6e9a74fSStefano Zampini 2821e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2822e6e9a74fSStefano Zampini { 2823e6e9a74fSStefano Zampini PetscErrorCode ierr; 2824e6e9a74fSStefano Zampini 2825e6e9a74fSStefano Zampini PetscFunctionBegin; 2826e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2827e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2828e6e9a74fSStefano Zampini } 2829e6e9a74fSStefano Zampini 2830e6e9a74fSStefano Zampini static PetscErrorCode 
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2831e6e9a74fSStefano Zampini { 2832e6e9a74fSStefano Zampini PetscErrorCode ierr; 2833e6e9a74fSStefano Zampini 2834e6e9a74fSStefano Zampini PetscFunctionBegin; 2835e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2836e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2837e6e9a74fSStefano Zampini } 2838e6e9a74fSStefano Zampini 2839e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2840e6e9a74fSStefano Zampini { 2841e6e9a74fSStefano Zampini PetscErrorCode ierr; 2842e6e9a74fSStefano Zampini 2843e6e9a74fSStefano Zampini PetscFunctionBegin; 2844e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 28459ae82921SPaul Mullowney PetscFunctionReturn(0); 28469ae82921SPaul Mullowney } 28479ae82921SPaul Mullowney 28486fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2849ca45077fSPaul Mullowney { 2850b175d8bbSPaul Mullowney PetscErrorCode ierr; 2851ca45077fSPaul Mullowney 2852ca45077fSPaul Mullowney PetscFunctionBegin; 2853e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2854ca45077fSPaul Mullowney PetscFunctionReturn(0); 2855ca45077fSPaul Mullowney } 2856ca45077fSPaul Mullowney 2857a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2858a0e72f99SJunchao Zhang { 2859a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2860a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2861a0e72f99SJunchao Zhang } 2862a0e72f99SJunchao Zhang 2863afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2864e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 28659ae82921SPaul Mullowney { 28669ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2867aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 28689ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2869e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2870b175d8bbSPaul Mullowney PetscErrorCode ierr; 2871aa372e3fSPaul Mullowney cusparseStatus_t stat; 2872e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2873e6e9a74fSStefano Zampini PetscBool compressed; 2874afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2875afb2bd1cSJunchao Zhang PetscInt nx,ny; 2876afb2bd1cSJunchao Zhang #endif 28776e111a19SKarl Rupp 28789ae82921SPaul Mullowney PetscFunctionBegin; 2879e8d2b73aSMark Adams if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2880e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2881afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2882d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2883e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2884e6e9a74fSStefano Zampini } 288534d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 288634d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2887e6e9a74fSStefano Zampini if (!trans) { 28889ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2889e8d2b73aSMark Adams if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2890e6e9a74fSStefano Zampini } else { 28911a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 2892e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2893e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2894e6e9a74fSStefano Zampini } else { 28951a2c6b5cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2896e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2897e6e9a74fSStefano Zampini } 2898e6e9a74fSStefano Zampini } 2899e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2900e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2901213423ffSJunchao Zhang 2902e6e9a74fSStefano Zampini try { 2903e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2904213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2905213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2906afb2bd1cSJunchao Zhang 290785ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2908e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2909afb2bd1cSJunchao Zhang /* z = A x + beta y. 2910afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
2911afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2912afb2bd1cSJunchao Zhang */ 2913e6e9a74fSStefano Zampini xptr = xarray; 2914afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2915213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2916afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2917afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2918afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2919afb2bd1cSJunchao Zhang */ 2920afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2921afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2922afb2bd1cSJunchao Zhang nx = mat->num_cols; 2923afb2bd1cSJunchao Zhang ny = mat->num_rows; 2924afb2bd1cSJunchao Zhang } 2925afb2bd1cSJunchao Zhang #endif 2926e6e9a74fSStefano Zampini } else { 2927afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2928afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2929afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2930afb2bd1cSJunchao Zhang */ 2931afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2932e6e9a74fSStefano Zampini dptr = zarray; 2933e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2934afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2935e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2936a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2937e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2938e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2939e6e9a74fSStefano Zampini } 2940afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2941afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2942afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2943afb2bd1cSJunchao Zhang nx = mat->num_rows; 2944afb2bd1cSJunchao Zhang ny = mat->num_cols; 2945afb2bd1cSJunchao Zhang } 2946afb2bd1cSJunchao Zhang #endif 2947e6e9a74fSStefano Zampini } 29489ae82921SPaul Mullowney 2949afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2950aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2951afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2952afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2953afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2954ee7b52eaSHong Zhang cudaError_t cerr; 2955afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2956afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2957afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2958afb2bd1cSJunchao Zhang matstruct->matDescr, 2959afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2960afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2961afb2bd1cSJunchao Zhang cusparse_scalartype, 2962afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2963afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2964afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2965afb2bd1cSJunchao Zhang 2966afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2967afb2bd1cSJunchao Zhang } else { 2968afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2969afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2970afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2971afb2bd1cSJunchao Zhang } 2972afb2bd1cSJunchao Zhang 2973afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2974afb2bd1cSJunchao Zhang matstruct->alpha_one, 29751a2c6b5cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2976afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2977afb2bd1cSJunchao Zhang beta, 2978afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2979afb2bd1cSJunchao Zhang cusparse_scalartype, 2980afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2981afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2982afb2bd1cSJunchao Zhang #else 29837656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 
2984e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2985a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 2986afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2987aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2988e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 298957d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2990afb2bd1cSJunchao Zhang #endif 2991aa372e3fSPaul Mullowney } else { 2992213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2993afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2994afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2995afb2bd1cSJunchao Zhang #else 2996301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2997e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2998afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2999e6e9a74fSStefano Zampini xptr, beta, 300057d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3001afb2bd1cSJunchao Zhang #endif 3002a65300a6SPaul Mullowney } 3003aa372e3fSPaul Mullowney } 3004958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3005aa372e3fSPaul Mullowney 3006e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3007213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3008213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3009213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3010e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3011213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 30127656d835SStefano Zampini } 3013213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3014c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 30157656d835SStefano Zampini } 30167656d835SStefano Zampini 3017213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3018213423ffSJunchao Zhang if (compressed) { 3019e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3020a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3021a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3022a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
3023a0e72f99SJunchao Zhang */ 3024a0e72f99SJunchao Zhang #if 0 3025a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3026a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3027a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3028e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3029c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3030a0e72f99SJunchao Zhang #else 3031a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3032a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3033a0e72f99SJunchao Zhang #endif 3034958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3035e6e9a74fSStefano Zampini } 3036e6e9a74fSStefano Zampini } else { 3037e6e9a74fSStefano Zampini if (yy && yy != zz) { 3038e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3039e6e9a74fSStefano Zampini } 3040e6e9a74fSStefano Zampini } 3041e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3042213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3043213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 30449ae82921SPaul Mullowney } catch(char *ex) { 30459ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 30469ae82921SPaul Mullowney } 3047e6e9a74fSStefano Zampini if (yy) { 3048958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3049e6e9a74fSStefano 
Zampini } else { 3050e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3051e6e9a74fSStefano Zampini } 30529ae82921SPaul Mullowney PetscFunctionReturn(0); 30539ae82921SPaul Mullowney } 30549ae82921SPaul Mullowney 30556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3056ca45077fSPaul Mullowney { 3057b175d8bbSPaul Mullowney PetscErrorCode ierr; 30586e111a19SKarl Rupp 3059ca45077fSPaul Mullowney PetscFunctionBegin; 3060e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3061ca45077fSPaul Mullowney PetscFunctionReturn(0); 3062ca45077fSPaul Mullowney } 3063ca45077fSPaul Mullowney 30646fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 30659ae82921SPaul Mullowney { 30669ae82921SPaul Mullowney PetscErrorCode ierr; 3067042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3068042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 30693fa6b06aSMark Adams 3070042217e8SBarry Smith PetscFunctionBegin; 3071042217e8SBarry Smith ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3072042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3073042217e8SBarry Smith cudaError_t cerr; 3074042217e8SBarry Smith 3075042217e8SBarry Smith ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3076042217e8SBarry Smith cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3077042217e8SBarry Smith cusp->deviceMat = NULL; 3078042217e8SBarry Smith } 30799ae82921SPaul Mullowney PetscFunctionReturn(0); 30809ae82921SPaul Mullowney } 30819ae82921SPaul Mullowney 30829ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3083e057df02SPaul Mullowney /*@ 30849ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 
3085e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately be pushed down 3086e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3087e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3088e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3089e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 30909ae82921SPaul Mullowney 3091d083f849SBarry Smith Collective 30929ae82921SPaul Mullowney 30939ae82921SPaul Mullowney Input Parameters: 30949ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 30959ae82921SPaul Mullowney . m - number of rows 30969ae82921SPaul Mullowney . n - number of columns 30979ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 30989ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 30990298fd71SBarry Smith (possibly different for each row) or NULL 31009ae82921SPaul Mullowney 31019ae82921SPaul Mullowney Output Parameter: 31029ae82921SPaul Mullowney . A - the matrix 31039ae82921SPaul Mullowney 31049ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 31059ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 31069ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 31079ae82921SPaul Mullowney 31089ae82921SPaul Mullowney Notes: 31099ae82921SPaul Mullowney If nnz is given then nz is ignored 31109ae82921SPaul Mullowney 31119ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 31129ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 31139ae82921SPaul Mullowney storage.
That is, the stored row and column indices can begin at 31149ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 31159ae82921SPaul Mullowney 31169ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 31170298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 31189ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 31199ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 31209ae82921SPaul Mullowney 31219ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 31229ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 31239ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 31249ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 31259ae82921SPaul Mullowney 31269ae82921SPaul Mullowney Level: intermediate 31279ae82921SPaul Mullowney 3128e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 31299ae82921SPaul Mullowney @*/ 31309ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 31319ae82921SPaul Mullowney { 31329ae82921SPaul Mullowney PetscErrorCode ierr; 31339ae82921SPaul Mullowney 31349ae82921SPaul Mullowney PetscFunctionBegin; 31359ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 31369ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 31379ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 31389ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 31399ae82921SPaul Mullowney PetscFunctionReturn(0); 
31409ae82921SPaul Mullowney } 31419ae82921SPaul Mullowney 31426fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 31439ae82921SPaul Mullowney { 31449ae82921SPaul Mullowney PetscErrorCode ierr; 3145ab25e6cbSDominic Meiser 31469ae82921SPaul Mullowney PetscFunctionBegin; 31479ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 3148470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 31499ae82921SPaul Mullowney } else { 3150470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3151aa372e3fSPaul Mullowney } 3152c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3153ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3154ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3155ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3156fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3157ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 31587e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 31597e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 31609ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 31619ae82921SPaul Mullowney PetscFunctionReturn(0); 31629ae82921SPaul Mullowney } 31639ae82921SPaul Mullowney 3164ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode 
MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 316595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 31669ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 31679ff858a8SKarl Rupp { 31689ff858a8SKarl Rupp PetscErrorCode ierr; 31699ff858a8SKarl Rupp 31709ff858a8SKarl Rupp PetscFunctionBegin; 31719ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3172ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 31739ff858a8SKarl Rupp PetscFunctionReturn(0); 31749ff858a8SKarl Rupp } 31759ff858a8SKarl Rupp 3176039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 317795639643SRichard Tran Mills { 3178e6e9a74fSStefano Zampini PetscErrorCode ierr; 3179a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3180039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3181039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3182039c6fbaSStefano Zampini PetscScalar *ay; 3183039c6fbaSStefano Zampini const PetscScalar *ax; 3184039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3185e6e9a74fSStefano Zampini 318695639643SRichard Tran Mills PetscFunctionBegin; 3187a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3188a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3189039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3190a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3191a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3192a587d139SMark PetscFunctionReturn(0); 319395639643SRichard Tran Mills } 3194039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 3195a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3196a587d139SMark ierr = 
MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3197e8d2b73aSMark Adams if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3198e8d2b73aSMark Adams if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3199039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3200039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3201039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3202039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3203039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3204039c6fbaSStefano Zampini if (eq) { 3205039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3206039c6fbaSStefano Zampini } 3207039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3208039c6fbaSStefano Zampini } 3209d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3210d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3211039c6fbaSStefano Zampini 3212039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3213039c6fbaSStefano Zampini cusparseStatus_t stat; 3214039c6fbaSStefano Zampini PetscScalar b = 1.0; 3215039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3216039c6fbaSStefano Zampini size_t bufferSize; 3217039c6fbaSStefano Zampini void *buffer; 3218ee7b52eaSHong Zhang cudaError_t cerr; 3219039c6fbaSStefano Zampini #endif 3220039c6fbaSStefano Zampini 3221039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3222039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3223039c6fbaSStefano Zampini stat = 
cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3224039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3225039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3226039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3227039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3228039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3229039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3230039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3231039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3232039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3233039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3234039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3235039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3236039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3237039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3238039c6fbaSStefano Zampini #else 3239039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3240039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3241039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3242039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3243039c6fbaSStefano Zampini 
cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3244039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3245039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3246039c6fbaSStefano Zampini #endif 3247039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3248039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3249039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3250039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3251039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3252a587d139SMark cublasHandle_t cublasv2handle; 3253039c6fbaSStefano Zampini cublasStatus_t berr; 3254a587d139SMark PetscBLASInt one = 1, bnz = 1; 3255039c6fbaSStefano Zampini 3256039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3257039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3258a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3259a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3260a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3261039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3262a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3263a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3264039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3265039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3266a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3267039c6fbaSStefano Zampini } else { 3268a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3269d2be01edSStefano Zampini ierr = 
MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3270a587d139SMark } 327195639643SRichard Tran Mills PetscFunctionReturn(0); 327295639643SRichard Tran Mills } 327395639643SRichard Tran Mills 327433c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 327533c9ba73SStefano Zampini { 327633c9ba73SStefano Zampini PetscErrorCode ierr; 327733c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 327833c9ba73SStefano Zampini PetscScalar *ay; 327933c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 328033c9ba73SStefano Zampini cublasStatus_t berr; 328133c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 328233c9ba73SStefano Zampini 328333c9ba73SStefano Zampini PetscFunctionBegin; 328433c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 328533c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 328633c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 328733c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 328833c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 328933c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 329033c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 329133c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 329233c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 329333c9ba73SStefano Zampini PetscFunctionReturn(0); 329433c9ba73SStefano Zampini } 329533c9ba73SStefano Zampini 32963fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 32973fa6b06aSMark Adams { 32983fa6b06aSMark Adams PetscErrorCode ierr; 32997e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3300a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 33017e8381f9SStefano Zampini 33023fa6b06aSMark Adams PetscFunctionBegin; 33033fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 33043fa6b06aSMark 
Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 33057e8381f9SStefano Zampini if (spptr->mat) { 33067e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 33077e8381f9SStefano Zampini if (matrix->values) { 33087e8381f9SStefano Zampini both = PETSC_TRUE; 33097e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 33107e8381f9SStefano Zampini } 33117e8381f9SStefano Zampini } 33127e8381f9SStefano Zampini if (spptr->matTranspose) { 33137e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 33147e8381f9SStefano Zampini if (matrix->values) { 33157e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 33167e8381f9SStefano Zampini } 33177e8381f9SStefano Zampini } 33183fa6b06aSMark Adams } 3319a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3320a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3321a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 33227e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3323a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 33243fa6b06aSMark Adams PetscFunctionReturn(0); 33253fa6b06aSMark Adams } 33263fa6b06aSMark Adams 3327a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3328a587d139SMark { 3329a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3330a587d139SMark PetscErrorCode ierr; 3331a587d139SMark 3332a587d139SMark PetscFunctionBegin; 3333a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3334a587d139SMark if (flg) { 3335a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3336a587d139SMark 333733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3338a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3339a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3340a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3341a587d139SMark 
A->ops->multadd = MatMultAdd_SeqAIJ; 3342a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3343a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3344a587d139SMark A->ops->multhermitiantranspose = NULL; 3345a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3346fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3347c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3348a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3349a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3350a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3351a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3352a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3353fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3354a587d139SMark } else { 335533c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3356a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3357a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3358a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3359a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3360a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3361a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3362a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3363a587d139SMark A->ops->multhermitiantransposeadd = 
MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3364fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3365c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3366a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3367a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3368a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3369a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3370a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3371fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3372a587d139SMark } 3373a587d139SMark A->boundtocpu = flg; 3374a587d139SMark a->inode.use = flg; 3375a587d139SMark PetscFunctionReturn(0); 3376a587d139SMark } 3377a587d139SMark 337849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 33799ae82921SPaul Mullowney { 33809ae82921SPaul Mullowney PetscErrorCode ierr; 3381aa372e3fSPaul Mullowney cusparseStatus_t stat; 338249735bf3SStefano Zampini Mat B; 33839ae82921SPaul Mullowney 33849ae82921SPaul Mullowney PetscFunctionBegin; 3385832b2c02SStefano Zampini ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 338649735bf3SStefano Zampini if (reuse 
== MAT_INITIAL_MATRIX) { 338749735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 338849735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 338949735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 339049735bf3SStefano Zampini } 339149735bf3SStefano Zampini B = *newmat; 339249735bf3SStefano Zampini 339334136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 339434136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 339534136279SStefano Zampini 339649735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 33979ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3398e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3399e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3400e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3401a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 34021a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3403d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3404d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3405d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3406d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3407d8132acaSStefano Zampini #endif 34081a2c6b5cSJunchao Zhang B->spptr = spptr; 34099ae82921SPaul Mullowney } else { 3410e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3411e6e9a74fSStefano Zampini 3412e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3413e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3414a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3415e6e9a74fSStefano Zampini 
B->spptr = spptr; 34169ae82921SPaul Mullowney } 3417e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 341849735bf3SStefano Zampini } 3419693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 34209ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 34211a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 34229ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 342395639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3424693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 34252205254eSKarl Rupp 3426e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 34279ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3428bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 34299ae82921SPaul Mullowney PetscFunctionReturn(0); 34309ae82921SPaul Mullowney } 34319ae82921SPaul Mullowney 343202fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 343302fe1965SBarry Smith { 343402fe1965SBarry Smith PetscErrorCode ierr; 343502fe1965SBarry Smith 343602fe1965SBarry Smith PetscFunctionBegin; 343702fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 34380ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 343902fe1965SBarry Smith PetscFunctionReturn(0); 344002fe1965SBarry Smith } 344102fe1965SBarry Smith 34423ca39a21SBarry Smith /*MC 3443e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3444e057df02SPaul Mullowney 3445e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 34462692e278SPaul Mullowney CSR, ELL, or Hybrid format. 
The ELL and HYB formats require CUDA 4.2 or later. 34472692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3448e057df02SPaul Mullowney 3449e057df02SPaul Mullowney Options Database Keys: 3450e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3451aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3452a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3453e057df02SPaul Mullowney 3454e057df02SPaul Mullowney Level: beginner 3455e057df02SPaul Mullowney 34568468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3457e057df02SPaul Mullowney M*/ 34587f756511SDominic Meiser 3459bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 34600f39cd5aSBarry Smith 34613ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 346242c9c57cSBarry Smith { 346342c9c57cSBarry Smith PetscErrorCode ierr; 346442c9c57cSBarry Smith 346542c9c57cSBarry Smith PetscFunctionBegin; 3466bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 34673ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 34683ca39a21SBarry Smith ierr = 
MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 34693ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 34703ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3471bddcd29dSMark Adams 347242c9c57cSBarry Smith PetscFunctionReturn(0); 347342c9c57cSBarry Smith } 347429b38603SBarry Smith 3475470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 34767f756511SDominic Meiser { 3477e6e9a74fSStefano Zampini PetscErrorCode ierr; 34787f756511SDominic Meiser cusparseStatus_t stat; 34797f756511SDominic Meiser 34807f756511SDominic Meiser PetscFunctionBegin; 34817f756511SDominic Meiser if (*cusparsestruct) { 3482e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3483e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 34847f756511SDominic Meiser delete (*cusparsestruct)->workVector; 348581902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 34867e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 34877e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3488a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 34897e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3490e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 34917f756511SDominic Meiser } 34927f756511SDominic Meiser PetscFunctionReturn(0); 34937f756511SDominic Meiser } 34947f756511SDominic Meiser 34957f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 
34967f756511SDominic Meiser { 34977f756511SDominic Meiser PetscFunctionBegin; 34987f756511SDominic Meiser if (*mat) { 34997f756511SDominic Meiser delete (*mat)->values; 35007f756511SDominic Meiser delete (*mat)->column_indices; 35017f756511SDominic Meiser delete (*mat)->row_offsets; 35027f756511SDominic Meiser delete *mat; 35037f756511SDominic Meiser *mat = 0; 35047f756511SDominic Meiser } 35057f756511SDominic Meiser PetscFunctionReturn(0); 35067f756511SDominic Meiser } 35077f756511SDominic Meiser 3508470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 35097f756511SDominic Meiser { 35107f756511SDominic Meiser cusparseStatus_t stat; 35117f756511SDominic Meiser PetscErrorCode ierr; 35127f756511SDominic Meiser 35137f756511SDominic Meiser PetscFunctionBegin; 35147f756511SDominic Meiser if (*trifactor) { 351557d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3516afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 35177f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 35181b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 35192cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3520afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 35211b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3522afb2bd1cSJunchao Zhang #endif 3523da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 35247f756511SDominic Meiser } 35257f756511SDominic Meiser PetscFunctionReturn(0); 35267f756511SDominic Meiser } 35277f756511SDominic Meiser 3528470880abSPatrick Sanan static 
PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 35297f756511SDominic Meiser { 35307f756511SDominic Meiser CsrMatrix *mat; 35317f756511SDominic Meiser cusparseStatus_t stat; 35327f756511SDominic Meiser cudaError_t err; 35337f756511SDominic Meiser 35347f756511SDominic Meiser PetscFunctionBegin; 35357f756511SDominic Meiser if (*matstruct) { 35367f756511SDominic Meiser if ((*matstruct)->mat) { 35377f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3538afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3539afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3540afb2bd1cSJunchao Zhang #else 35417f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 354257d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3543afb2bd1cSJunchao Zhang #endif 35447f756511SDominic Meiser } else { 35457f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 35467f756511SDominic Meiser CsrMatrix_Destroy(&mat); 35477f756511SDominic Meiser } 35487f756511SDominic Meiser } 354957d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 35507f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3551afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 35527656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 35537656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3554afb2bd1cSJunchao Zhang 3555afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3556afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3557afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = 
cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3558afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3559afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 3560afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3561afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3562afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3563afb2bd1cSJunchao Zhang } 3564afb2bd1cSJunchao Zhang } 3565afb2bd1cSJunchao Zhang #endif 35667f756511SDominic Meiser delete *matstruct; 35677e8381f9SStefano Zampini *matstruct = NULL; 35687f756511SDominic Meiser } 35697f756511SDominic Meiser PetscFunctionReturn(0); 35707f756511SDominic Meiser } 35717f756511SDominic Meiser 3572e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 35737f756511SDominic Meiser { 3574e6e9a74fSStefano Zampini PetscErrorCode ierr; 3575e6e9a74fSStefano Zampini 35767f756511SDominic Meiser PetscFunctionBegin; 35777f756511SDominic Meiser if (*trifactors) { 3578e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3579e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3580e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3581e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 35827f756511SDominic Meiser delete (*trifactors)->rpermIndices; 35837f756511SDominic Meiser delete (*trifactors)->cpermIndices; 35847f756511SDominic Meiser delete (*trifactors)->workVector; 35857e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 35867e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 35877e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 
3588bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3589bddcd29dSMark Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3590e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3591ccdfe979SStefano Zampini } 3592ccdfe979SStefano Zampini PetscFunctionReturn(0); 3593ccdfe979SStefano Zampini } 3594ccdfe979SStefano Zampini 3595ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3596ccdfe979SStefano Zampini { 3597e6e9a74fSStefano Zampini PetscErrorCode ierr; 3598ccdfe979SStefano Zampini cusparseHandle_t handle; 3599ccdfe979SStefano Zampini cusparseStatus_t stat; 3600ccdfe979SStefano Zampini 3601ccdfe979SStefano Zampini PetscFunctionBegin; 3602ccdfe979SStefano Zampini if (*trifactors) { 3603e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 36047f756511SDominic Meiser if (handle = (*trifactors)->handle) { 360557d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 36067f756511SDominic Meiser } 3607e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 36087f756511SDominic Meiser } 36097f756511SDominic Meiser PetscFunctionReturn(0); 36107f756511SDominic Meiser } 36117e8381f9SStefano Zampini 36127e8381f9SStefano Zampini struct IJCompare 36137e8381f9SStefano Zampini { 36147e8381f9SStefano Zampini __host__ __device__ 36157e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 36167e8381f9SStefano Zampini { 36177e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 36187e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 36197e8381f9SStefano Zampini return false; 36207e8381f9SStefano Zampini } 36217e8381f9SStefano Zampini }; 36227e8381f9SStefano Zampini 
36237e8381f9SStefano Zampini struct IJEqual 36247e8381f9SStefano Zampini { 36257e8381f9SStefano Zampini __host__ __device__ 36267e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 36277e8381f9SStefano Zampini { 36287e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 36297e8381f9SStefano Zampini return true; 36307e8381f9SStefano Zampini } 36317e8381f9SStefano Zampini }; 36327e8381f9SStefano Zampini 36337e8381f9SStefano Zampini struct IJDiff 36347e8381f9SStefano Zampini { 36357e8381f9SStefano Zampini __host__ __device__ 36367e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 36377e8381f9SStefano Zampini { 36387e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 36397e8381f9SStefano Zampini } 36407e8381f9SStefano Zampini }; 36417e8381f9SStefano Zampini 36427e8381f9SStefano Zampini struct IJSum 36437e8381f9SStefano Zampini { 36447e8381f9SStefano Zampini __host__ __device__ 36457e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 36467e8381f9SStefano Zampini { 36477e8381f9SStefano Zampini return t1||t2; 36487e8381f9SStefano Zampini } 36497e8381f9SStefano Zampini }; 36507e8381f9SStefano Zampini 36517e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3652e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 36537e8381f9SStefano Zampini { 36547e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3655fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3656bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 365708391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 36587e8381f9SStefano Zampini CsrMatrix *matrix; 36597e8381f9SStefano Zampini PetscErrorCode ierr; 36607e8381f9SStefano Zampini PetscInt n; 36617e8381f9SStefano 
Zampini 36627e8381f9SStefano Zampini PetscFunctionBegin; 36637e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 36647e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 36657e8381f9SStefano Zampini if (!cusp->cooPerm) { 36667e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 36677e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 36687e8381f9SStefano Zampini PetscFunctionReturn(0); 36697e8381f9SStefano Zampini } 36707e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 36717e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3672e61fc153SStefano Zampini if (!v) { 3673e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3674e61fc153SStefano Zampini goto finalize; 36757e8381f9SStefano Zampini } 3676e61fc153SStefano Zampini n = cusp->cooPerm->size(); 367708391a17SStefano Zampini if (isCudaMem(v)) { 367808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 367908391a17SStefano Zampini } else { 3680e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3681e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 368208391a17SStefano Zampini d_v = cooPerm_v->data(); 3683e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 368408391a17SStefano Zampini } 3685bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3686e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 36877e8381f9SStefano Zampini if (cusp->cooPerm_a) { 3688bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 368908391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3690e61fc153SStefano Zampini 
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3691e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3692e61fc153SStefano Zampini delete cooPerm_w; 36937e8381f9SStefano Zampini } else { 369408391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 36957e8381f9SStefano Zampini matrix->values->begin())); 369608391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 36977e8381f9SStefano Zampini matrix->values->end())); 36987e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 36997e8381f9SStefano Zampini } 37007e8381f9SStefano Zampini } else { 3701e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 370208391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3703e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 37047e8381f9SStefano Zampini } else { 370508391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 37067e8381f9SStefano Zampini matrix->values->begin())); 370708391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 37087e8381f9SStefano Zampini matrix->values->end())); 37097e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 37107e8381f9SStefano Zampini } 37117e8381f9SStefano 
Zampini } 3712bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3713e61fc153SStefano Zampini finalize: 3714e61fc153SStefano Zampini delete cooPerm_v; 37157e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3716e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3717fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3718fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3719fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3720fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3721fcdce8c4SStefano Zampini a->reallocs = 0; 3722fcdce8c4SStefano Zampini A->info.mallocs += 0; 3723fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3724fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3725fcdce8c4SStefano Zampini A->num_ass++; 37267e8381f9SStefano Zampini PetscFunctionReturn(0); 37277e8381f9SStefano Zampini } 37287e8381f9SStefano Zampini 3729a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3730a49f1ed0SStefano Zampini { 3731a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3732a49f1ed0SStefano Zampini PetscErrorCode ierr; 3733a49f1ed0SStefano Zampini 3734a49f1ed0SStefano Zampini PetscFunctionBegin; 3735a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3736a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3737a49f1ed0SStefano Zampini if (destroy) { 3738a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3739a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3740a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3741a49f1ed0SStefano Zampini } 37421a2c6b5cSJunchao Zhang A->transupdated = 
PETSC_FALSE; 3743a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3744a49f1ed0SStefano Zampini } 3745a49f1ed0SStefano Zampini 37467e8381f9SStefano Zampini #include <thrust/binary_search.h> 3747e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 37487e8381f9SStefano Zampini { 37497e8381f9SStefano Zampini PetscErrorCode ierr; 37507e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 37517e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 37527e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 37537e8381f9SStefano Zampini cudaError_t cerr; 37547e8381f9SStefano Zampini 37557e8381f9SStefano Zampini PetscFunctionBegin; 37567e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 37577e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 37587e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 37597e8381f9SStefano Zampini if (n != cooPerm_n) { 37607e8381f9SStefano Zampini delete cusp->cooPerm; 37617e8381f9SStefano Zampini delete cusp->cooPerm_a; 37627e8381f9SStefano Zampini cusp->cooPerm = NULL; 37637e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 37647e8381f9SStefano Zampini } 37657e8381f9SStefano Zampini if (n) { 37667e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 37677e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 37687e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 37697e8381f9SStefano Zampini 37707e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 37717e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 37727e8381f9SStefano Zampini 37737e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 37747e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 37757e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 37767e8381f9SStefano Zampini auto fkey = 
thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 37777e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 37787e8381f9SStefano Zampini 377908391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 37807e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 37817e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 37827e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 37837e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 37847e8381f9SStefano Zampini 37857e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 37867e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 37877e8381f9SStefano Zampini delete cusp->cooPerm_a; 37887e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 37897e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 37907e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 37917e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 37927e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 37937e8381f9SStefano Zampini w[0] = 0; 37947e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 37957e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 37967e8381f9SStefano Zampini } 37977e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 37987e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 37997e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 38007e8381f9SStefano Zampini ii.begin()); 380108391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 
38027e8381f9SStefano Zampini 38037e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 38047e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 38057e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 38067e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 38077e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 38087e8381f9SStefano Zampini a->i[0] = 0; 38097e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 38107e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3811fcdce8c4SStefano Zampini a->rmax = 0; 38127e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 38137e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 38147e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 38157e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 38167e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 38177e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 38187e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 38197e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 38207e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3821fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 38227e8381f9SStefano Zampini } 3823fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 38247e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 38257e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3826fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 38277e8381f9SStefano Zampini } else { 38287e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 38297e8381f9SStefano Zampini } 3830e61fc153SStefano Zampini ierr = 
MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 38317e8381f9SStefano Zampini 38327e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3833e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 3834e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 38357e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 38367e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 38377e8381f9SStefano Zampini A->nonzerostate++; 38387e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3839a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 38407e8381f9SStefano Zampini 38417e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 38427e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 38437e8381f9SStefano Zampini PetscFunctionReturn(0); 38447e8381f9SStefano Zampini } 3845ed502f03SStefano Zampini 3846ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3847ed502f03SStefano Zampini { 3848ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3849ed502f03SStefano Zampini CsrMatrix *csr; 3850ed502f03SStefano Zampini PetscErrorCode ierr; 3851ed502f03SStefano Zampini 3852ed502f03SStefano Zampini PetscFunctionBegin; 3853ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3854ed502f03SStefano Zampini PetscValidPointer(a,2); 3855ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3856ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3857ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 385833c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 
3859ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3860ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3861ed502f03SStefano Zampini *a = csr->values->data().get(); 3862ed502f03SStefano Zampini PetscFunctionReturn(0); 3863ed502f03SStefano Zampini } 3864ed502f03SStefano Zampini 3865ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3866ed502f03SStefano Zampini { 3867ed502f03SStefano Zampini PetscFunctionBegin; 3868ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3869ed502f03SStefano Zampini PetscValidPointer(a,2); 3870ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3871ed502f03SStefano Zampini *a = NULL; 3872ed502f03SStefano Zampini PetscFunctionReturn(0); 3873ed502f03SStefano Zampini } 3874ed502f03SStefano Zampini 3875039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3876039c6fbaSStefano Zampini { 3877039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3878039c6fbaSStefano Zampini CsrMatrix *csr; 3879039c6fbaSStefano Zampini PetscErrorCode ierr; 3880039c6fbaSStefano Zampini 3881039c6fbaSStefano Zampini PetscFunctionBegin; 3882039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3883039c6fbaSStefano Zampini PetscValidPointer(a,2); 3884039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3885039c6fbaSStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3886039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 388733c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3888039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3889039c6fbaSStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 
3890039c6fbaSStefano Zampini *a = csr->values->data().get(); 3891039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3892a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3893039c6fbaSStefano Zampini PetscFunctionReturn(0); 3894039c6fbaSStefano Zampini } 3895039c6fbaSStefano Zampini 3896039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3897039c6fbaSStefano Zampini { 3898039c6fbaSStefano Zampini PetscErrorCode ierr; 3899039c6fbaSStefano Zampini 3900039c6fbaSStefano Zampini PetscFunctionBegin; 3901039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3902039c6fbaSStefano Zampini PetscValidPointer(a,2); 3903039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3904039c6fbaSStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3905039c6fbaSStefano Zampini *a = NULL; 3906039c6fbaSStefano Zampini PetscFunctionReturn(0); 3907039c6fbaSStefano Zampini } 3908039c6fbaSStefano Zampini 3909ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3910ed502f03SStefano Zampini { 3911ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3912ed502f03SStefano Zampini CsrMatrix *csr; 3913a49f1ed0SStefano Zampini PetscErrorCode ierr; 3914ed502f03SStefano Zampini 3915ed502f03SStefano Zampini PetscFunctionBegin; 3916ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3917ed502f03SStefano Zampini PetscValidPointer(a,2); 3918ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3919ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 392033c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3921ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3922ed502f03SStefano 
Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3923ed502f03SStefano Zampini *a = csr->values->data().get(); 3924039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3925a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3926ed502f03SStefano Zampini PetscFunctionReturn(0); 3927ed502f03SStefano Zampini } 3928ed502f03SStefano Zampini 3929ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3930ed502f03SStefano Zampini { 3931ed502f03SStefano Zampini PetscErrorCode ierr; 3932ed502f03SStefano Zampini 3933ed502f03SStefano Zampini PetscFunctionBegin; 3934ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3935ed502f03SStefano Zampini PetscValidPointer(a,2); 3936ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3937ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3938ed502f03SStefano Zampini *a = NULL; 3939ed502f03SStefano Zampini PetscFunctionReturn(0); 3940ed502f03SStefano Zampini } 3941ed502f03SStefano Zampini 3942ed502f03SStefano Zampini struct IJCompare4 3943ed502f03SStefano Zampini { 3944ed502f03SStefano Zampini __host__ __device__ 39452ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3946ed502f03SStefano Zampini { 3947ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 3948ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3949ed502f03SStefano Zampini return false; 3950ed502f03SStefano Zampini } 3951ed502f03SStefano Zampini }; 3952ed502f03SStefano Zampini 39538909a122SStefano Zampini struct Shift 39548909a122SStefano Zampini { 3955ed502f03SStefano Zampini int _shift; 3956ed502f03SStefano Zampini 3957ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 3958ed502f03SStefano Zampini __host__ 
__device__ 3959ed502f03SStefano Zampini inline int operator() (const int &c) 3960ed502f03SStefano Zampini { 3961ed502f03SStefano Zampini return c + _shift; 3962ed502f03SStefano Zampini } 3963ed502f03SStefano Zampini }; 3964ed502f03SStefano Zampini 3965ed502f03SStefano Zampini /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3966ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3967ed502f03SStefano Zampini { 3968ed502f03SStefano Zampini PetscErrorCode ierr; 3969ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3970ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3971ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3972ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3973ed502f03SStefano Zampini PetscInt Annz,Bnnz; 3974ed502f03SStefano Zampini cusparseStatus_t stat; 3975ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 3976ed502f03SStefano Zampini cudaError_t cerr; 3977ed502f03SStefano Zampini 3978ed502f03SStefano Zampini PetscFunctionBegin; 3979ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3980ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3981ed502f03SStefano Zampini PetscValidPointer(C,4); 3982ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3983ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3984ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3985ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3986ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3987ed502f03SStefano 
Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3988ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 3989ed502f03SStefano Zampini m = A->rmap->n; 3990ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 3991ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3992ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3993ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3994ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 3995ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3996ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3997ed502f03SStefano Zampini Ccsr = new CsrMatrix; 3998ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 3999ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4000ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4001ed502f03SStefano Zampini c->compressedrow.i = NULL; 4002ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4003ed502f03SStefano Zampini Ccusp->workVector = NULL; 4004ed502f03SStefano Zampini Ccusp->nrows = m; 4005ed502f03SStefano Zampini Ccusp->mat = Cmat; 4006ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4007ed502f03SStefano Zampini Ccsr->num_rows = m; 4008ed502f03SStefano Zampini Ccsr->num_cols = n; 4009ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4010ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4011ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4012ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4013ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4014ed502f03SStefano Zampini cerr = 
cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4015ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4016ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4017ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4018ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4019ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 40201a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 40211a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 4022ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4023ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4024ed502f03SStefano Zampini 4025ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4026ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4027ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4028ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4029ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4030ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4031ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4032ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4033ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4034ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4035ed502f03SStefano Zampini if (c->nz) { 40362ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 40372ed87e7eSStefano Zampini auto Bcoo = new 
THRUSTINTARRAY32(Bnnz); 40382ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 40392ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 40402ed87e7eSStefano Zampini 4041ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4042ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4043ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4044ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4045ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4046ed502f03SStefano Zampini } 40472ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 40482ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4049ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4050ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4051ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4052ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4053ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4054ed502f03SStefano Zampini } 40552ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 40562ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 4057ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 40582ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 40592ed87e7eSStefano Zampini Aroff->data().get(), 40602ed87e7eSStefano Zampini Annz, 40612ed87e7eSStefano Zampini m, 40622ed87e7eSStefano Zampini Acoo->data().get(), 40632ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4064ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 40652ed87e7eSStefano Zampini Broff->data().get(), 4066ed502f03SStefano Zampini Bnnz, 4067ed502f03SStefano Zampini m, 40682ed87e7eSStefano Zampini Bcoo->data().get(), 4069ed502f03SStefano Zampini 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 40702ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 40712ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 40722ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 40738909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4074ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4075ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 40768909a122SStefano Zampini #else 40778909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 40788909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 40798909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 40808909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 40818909a122SStefano Zampini #endif 40822ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 40832ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 40842ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 40852ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 40862ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 40872ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4088ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4089ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 
4090ed502f03SStefano Zampini thrust::advance(p2,Annz); 40912ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 40928909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 40938909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 40948909a122SStefano Zampini #endif 40952ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 40962ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 40972ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 40982ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 40992ed87e7eSStefano Zampini #else 41002ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 41012ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 41022ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 41032ed87e7eSStefano Zampini #endif 4104ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 41052ed87e7eSStefano Zampini Ccoo->data().get(), 4106ed502f03SStefano Zampini c->nz, 4107ed502f03SStefano Zampini m, 4108ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4109ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4110ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 41112ed87e7eSStefano Zampini delete wPerm; 41122ed87e7eSStefano Zampini delete Acoo; 41132ed87e7eSStefano Zampini delete Bcoo; 41142ed87e7eSStefano Zampini delete Ccoo; 4115ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4116ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4117ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4118ed502f03SStefano 
Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4119ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4120ed502f03SStefano Zampini #endif 41211a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4122ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4123ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4124ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4125ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4126ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4127ed502f03SStefano Zampini 41281a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 41291a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4130a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4131ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4132ed502f03SStefano Zampini CmatT->mat = CcsrT; 4133ed502f03SStefano Zampini CcsrT->num_rows = n; 4134ed502f03SStefano Zampini CcsrT->num_cols = m; 4135ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4136ed502f03SStefano Zampini 4137ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4138ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4139ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4140ed502f03SStefano Zampini 4141ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4142ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4143ed502f03SStefano Zampini if (AT) { 4144ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4145ed502f03SStefano Zampini thrust::advance(rT,-1); 4146ed502f03SStefano Zampini } 4147ed502f03SStefano 
Zampini if (BT) { 4148ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4149ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4150ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4151ed502f03SStefano Zampini } 4152ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4153ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4154ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4155ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4156ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4157ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4158ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4159ed502f03SStefano Zampini 4160ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4161ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4162ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4163ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4164ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4165ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4166ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4167ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 
4168ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4169ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4170ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4171ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4172ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4173ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4174ed502f03SStefano Zampini #endif 4175ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4176ed502f03SStefano Zampini } 4177ed502f03SStefano Zampini } 4178ed502f03SStefano Zampini 4179ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4180ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4181ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4182ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4183ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4184ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4185ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4186ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4187ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4188ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4189ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4190ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4191ed502f03SStefano Zampini } else { 4192ed502f03SStefano Zampini cerr = 
cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4193ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4194ed502f03SStefano Zampini } 4195ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4196ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4197ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4198ed502f03SStefano Zampini c->maxnz = c->nz; 4199ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4200ed502f03SStefano Zampini c->rmax = 0; 4201ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4202ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4203ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4204ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4205ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4206ed502f03SStefano Zampini } 4207ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4208ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4209ed502f03SStefano Zampini (*C)->nonzerostate++; 4210ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4211ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4212ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4213ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4214ed502f03SStefano Zampini } else { 4215ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4216ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4217ed502f03SStefano Zampini if (c->nz) { 4218ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 
4219ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4220ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4221ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4222ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4223ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4224ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4225ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4226ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4227ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4228ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4229ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4230ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4231ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4232ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4233ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != 
%D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4234ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4235ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4236ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4237ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4238ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4239ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4240ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4241ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4242ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4243ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4244ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4245ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4246ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4247a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 42481a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4249ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4250ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4251ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4252ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4253ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4254ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4255ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4256ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 42571a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4258ed502f03SStefano Zampini } 4259ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4260ed502f03SStefano Zampini } 4261ed502f03SStefano Zampini } 4262ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4263ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4264ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4265ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4266ed502f03SStefano Zampini PetscFunctionReturn(0); 4267ed502f03SStefano Zampini } 4268c215019aSStefano Zampini 4269c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4270c215019aSStefano Zampini { 4271c215019aSStefano Zampini PetscErrorCode ierr; 4272c215019aSStefano Zampini bool dmem; 4273c215019aSStefano Zampini const PetscScalar *av; 4274c215019aSStefano Zampini cudaError_t cerr; 4275c215019aSStefano Zampini 4276c215019aSStefano Zampini PetscFunctionBegin; 4277c215019aSStefano Zampini dmem = isCudaMem(v); 4278c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4279c215019aSStefano Zampini if (n && idx) { 4280c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4281c215019aSStefano Zampini widx.assign(idx,idx+n); 4282c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4283c215019aSStefano Zampini 4284c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4285c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 
4286c215019aSStefano Zampini if (dmem) { 4287c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4288c215019aSStefano Zampini } else { 4289c215019aSStefano Zampini w = new THRUSTARRAY(n); 4290c215019aSStefano Zampini dv = w->data(); 4291c215019aSStefano Zampini } 4292c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4293c215019aSStefano Zampini 4294c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4295c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4296c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4297c215019aSStefano Zampini if (w) { 4298c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4299c215019aSStefano Zampini } 4300c215019aSStefano Zampini delete w; 4301c215019aSStefano Zampini } else { 4302c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4303c215019aSStefano Zampini } 4304c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4305c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4306c215019aSStefano Zampini PetscFunctionReturn(0); 4307c215019aSStefano Zampini } 4308