/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* Name table for MatCUSPARSEStorageFormat; it is handed to PetscOptionsEnum() by
   MatSetFromOptions_SeqAIJCUSPARSE() for the -mat_cusparse_*storage_format options. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Entry k of each table names the enum member whose integer value is k (see the copied enums above). */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
/* Note the order: CSR_ALG1 (= 4) comes before COO_ALG4 (= 5) because the table follows integer values, not names. */
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 726fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 769ae82921SPaul Mullowney 777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 827f756511SDominic Meiser 83042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 85a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 8657181aedSStefano Zampini 877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 897e8381f9SStefano Zampini 90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 91c215019aSStefano Zampini 92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 93b06137fdSPaul Mullowney { 94b06137fdSPaul Mullowney cusparseStatus_t stat; 95b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 96b06137fdSPaul Mullowney 97b06137fdSPaul Mullowney PetscFunctionBegin; 98d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 99b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10057d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 101b06137fdSPaul Mullowney PetscFunctionReturn(0); 102b06137fdSPaul Mullowney } 103b06137fdSPaul Mullowney 104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 105b06137fdSPaul Mullowney { 106b06137fdSPaul Mullowney cusparseStatus_t stat; 107b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 108b06137fdSPaul Mullowney 109b06137fdSPaul Mullowney PetscFunctionBegin; 110d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1116b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11216a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11357d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11416a2e217SAlejandro Lamas Daviña } 115b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1166b1cf21dSAlejandro Lamas Daviña } 11757d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 118b06137fdSPaul Mullowney PetscFunctionReturn(0); 119b06137fdSPaul Mullowney } 120b06137fdSPaul Mullowney 121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 122b06137fdSPaul Mullowney { 123b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1247e8381f9SStefano Zampini PetscBool flg; 1257e8381f9SStefano Zampini PetscErrorCode ierr; 126ccdfe979SStefano Zampini 127b06137fdSPaul Mullowney PetscFunctionBegin; 1287e8381f9SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1297e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 131b06137fdSPaul Mullowney PetscFunctionReturn(0); 132b06137fdSPaul Mullowney } 133b06137fdSPaul Mullowney 134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1359ae82921SPaul Mullowney { 1369ae82921SPaul Mullowney PetscFunctionBegin; 1379ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1389ae82921SPaul Mullowney PetscFunctionReturn(0); 1399ae82921SPaul Mullowney } 1409ae82921SPaul Mullowney 141c708e6cdSJed Brown /*MC 142087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
148c708e6cdSJed Brown 1499ae82921SPaul Mullowney Level: beginner 150c708e6cdSJed Brown 1513ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152c708e6cdSJed Brown M*/ 1539ae82921SPaul Mullowney 15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1559ae82921SPaul Mullowney { 1569ae82921SPaul Mullowney PetscErrorCode ierr; 157bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1589ae82921SPaul Mullowney 1599ae82921SPaul Mullowney PetscFunctionBegin; 160bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1622c7c0729SBarry Smith (*B)->factortype = ftype; 1639ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1642205254eSKarl Rupp 165087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16633d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1679ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1689ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1694ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1704ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 1714ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 172087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 173087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 174087f3262SPaul Mullowney 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1754ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 1764ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 1779ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 178bc3f50f2SPaul Mullowney 179fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1804ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1813ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1829ae82921SPaul Mullowney PetscFunctionReturn(0); 1839ae82921SPaul Mullowney } 1849ae82921SPaul Mullowney 185bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 186ca45077fSPaul Mullowney { 187aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1886e111a19SKarl Rupp 189ca45077fSPaul Mullowney PetscFunctionBegin; 190ca45077fSPaul Mullowney switch (op) { 191e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 192aa372e3fSPaul Mullowney cusparsestruct->format = format; 193ca45077fSPaul Mullowney break; 194e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 195aa372e3fSPaul Mullowney cusparsestruct->format = format; 196ca45077fSPaul Mullowney break; 197ca45077fSPaul Mullowney default: 19836d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 199ca45077fSPaul Mullowney } 200ca45077fSPaul Mullowney PetscFunctionReturn(0); 201ca45077fSPaul Mullowney } 2029ae82921SPaul Mullowney 203e057df02SPaul Mullowney /*@ 204e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 205e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 206aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 207e057df02SPaul Mullowney Not Collective 208e057df02SPaul Mullowney 209e057df02SPaul Mullowney Input Parameters: 2108468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 21136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2122692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 213e057df02SPaul Mullowney 214e057df02SPaul Mullowney Output Parameter: 215e057df02SPaul Mullowney 216e057df02SPaul Mullowney Level: intermediate 217e057df02SPaul Mullowney 2188468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 219e057df02SPaul Mullowney @*/ 220e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 221e057df02SPaul Mullowney { 222e057df02SPaul Mullowney PetscErrorCode ierr; 2236e111a19SKarl Rupp 224e057df02SPaul Mullowney PetscFunctionBegin; 225e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 226e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 227e057df02SPaul Mullowney PetscFunctionReturn(0); 228e057df02SPaul Mullowney } 229e057df02SPaul Mullowney 2301a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 231e6e9a74fSStefano Zampini { 232e6e9a74fSStefano Zampini PetscErrorCode ierr; 233e6e9a74fSStefano Zampini 234e6e9a74fSStefano Zampini PetscFunctionBegin; 2351a2c6b5cSJunchao Zhang switch (op) { 2361a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2371a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2381a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2391a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2401a2c6b5cSJunchao Zhang break; 2411a2c6b5cSJunchao Zhang default: 2421a2c6b5cSJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2431a2c6b5cSJunchao Zhang break; 244e6e9a74fSStefano Zampini } 245e6e9a74fSStefano Zampini PetscFunctionReturn(0); 246e6e9a74fSStefano Zampini } 247e6e9a74fSStefano Zampini 248bddcd29dSMark Adams static 
PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 249bddcd29dSMark Adams 250bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 251bddcd29dSMark Adams { 252bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 253bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 254bddcd29dSMark Adams PetscBool row_identity,col_identity; 255bddcd29dSMark Adams PetscErrorCode ierr; 256bddcd29dSMark Adams 257bddcd29dSMark Adams PetscFunctionBegin; 258bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 259bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 260bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 261bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 262bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 263bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 264bddcd29dSMark Adams if (row_identity && col_identity) { 265bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 266bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 267bddcd29dSMark Adams B->ops->matsolve = NULL; 268bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 269bddcd29dSMark Adams } else { 270bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 271bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 272bddcd29dSMark Adams B->ops->matsolve = NULL; 273bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 274bddcd29dSMark Adams } 275bddcd29dSMark Adams 276bddcd29dSMark Adams /* get the triangular factors */ 277bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 278bddcd29dSMark Adams PetscFunctionReturn(0); 279bddcd29dSMark Adams } 280bddcd29dSMark Adams 2814416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems 
*PetscOptionsObject,Mat A) 2829ae82921SPaul Mullowney { 2839ae82921SPaul Mullowney PetscErrorCode ierr; 284e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2859ae82921SPaul Mullowney PetscBool flg; 286a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2876e111a19SKarl Rupp 2889ae82921SPaul Mullowney PetscFunctionBegin; 289e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2909ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 291e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 292a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 293afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 294afb2bd1cSJunchao Zhang 2954c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 296a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 297afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 298afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 299afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 300afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 301afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 302a435da06SStefano 
Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 303a435da06SStefano Zampini if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 304a435da06SStefano Zampini #else 305afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 306a435da06SStefano Zampini #endif 307afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 308afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 309afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 310afb2bd1cSJunchao Zhang 311afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 312afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 313afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 314afb2bd1cSJunchao Zhang #endif 3154c87dfd4SPaul Mullowney } 3160af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3179ae82921SPaul Mullowney PetscFunctionReturn(0); 3189ae82921SPaul Mullowney } 3199ae82921SPaul Mullowney 3206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3219ae82921SPaul Mullowney { 
322da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3239ae82921SPaul Mullowney PetscErrorCode ierr; 3249ae82921SPaul Mullowney 3259ae82921SPaul Mullowney PetscFunctionBegin; 326da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3279ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3289ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3299ae82921SPaul Mullowney PetscFunctionReturn(0); 3309ae82921SPaul Mullowney } 3319ae82921SPaul Mullowney 3326fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3339ae82921SPaul Mullowney { 334da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3359ae82921SPaul Mullowney PetscErrorCode ierr; 3369ae82921SPaul Mullowney 3379ae82921SPaul Mullowney PetscFunctionBegin; 338da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3399ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3409ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3419ae82921SPaul Mullowney PetscFunctionReturn(0); 3429ae82921SPaul Mullowney } 3439ae82921SPaul Mullowney 344087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 345087f3262SPaul Mullowney { 346da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 347087f3262SPaul Mullowney PetscErrorCode ierr; 348087f3262SPaul Mullowney 349087f3262SPaul Mullowney PetscFunctionBegin; 350da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 351087f3262SPaul Mullowney ierr = 
MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 352087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 353087f3262SPaul Mullowney PetscFunctionReturn(0); 354087f3262SPaul Mullowney } 355087f3262SPaul Mullowney 356087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 357087f3262SPaul Mullowney { 358da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 359087f3262SPaul Mullowney PetscErrorCode ierr; 360087f3262SPaul Mullowney 361087f3262SPaul Mullowney PetscFunctionBegin; 362da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 363087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 364087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 365087f3262SPaul Mullowney PetscFunctionReturn(0); 366087f3262SPaul Mullowney } 367087f3262SPaul Mullowney 368087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3699ae82921SPaul Mullowney { 3709ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3719ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3729ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 373aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3749ae82921SPaul Mullowney cusparseStatus_t stat; 3759ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3769ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3779ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3789ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 379b175d8bbSPaul Mullowney PetscErrorCode ierr; 38057d48284SJunchao Zhang cudaError_t cerr; 3819ae82921SPaul 

  PetscFunctionBegin;
  /* nothing to build for an empty matrix */
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the current values are flagged as living on the host (or nothing is allocated yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* First call: build structure and values, create the cuSPARSE objects and run the solve analysis */
        PetscScalar                       *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        /* row 0 consists of the unit diagonal entry only */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the nz stored entries of row i ... */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* ... and append the explicit unit diagonal (column i, value 1) */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        /* NOTE(review): the assign() calls below presumably copy the host buffers into device storage
           (the transfer is accounted for with PetscLogCpuToGpu() further down) - confirm against the
           THRUSTINTARRAY32 / THRUSTARRAY definitions */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9: query the work buffer size, then allocate that buffer on the device */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        /* the trailing arguments of this call differ between CUDA versions, hence the #if inside the argument list */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is not freed: it is kept in AA_h and reused by the values-only update path below */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        /* the factor already exists: only the numerical values are refreshed, the index arrays are left as they are */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          /* unit diagonal entry closing row i */
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/ 5349ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 535da79fbbcSStefano Zampini if (!upTriFactor) { 5362cbc15d9SMark PetscScalar *AAUp; 5372cbc15d9SMark 5382cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5392cbc15d9SMark 5409ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 54157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 54257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5439ae82921SPaul Mullowney 5449ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5459ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5469ae82921SPaul Mullowney AiUp[n]=nzUpper; 5479ae82921SPaul Mullowney offset = nzUpper; 5489ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5499ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5509ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5519ae82921SPaul Mullowney 552e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5539ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5549ae82921SPaul Mullowney 555e057df02SPaul Mullowney /* decrement the offset */ 5569ae82921SPaul Mullowney offset -= (nz+1); 5579ae82921SPaul Mullowney 558e057df02SPaul Mullowney /* first, set the diagonal elements */ 5599ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 56009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5619ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5629ae82921SPaul Mullowney 563580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 564580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5659ae82921SPaul Mullowney } 5662205254eSKarl Rupp 567aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 568da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 569da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5702205254eSKarl Rupp 571aa372e3fSPaul Mullowney /* Create the matrix description */ 57257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 57357d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5741b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 575afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 576afb2bd1cSJunchao Zhang #else 57757d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 578afb2bd1cSJunchao Zhang #endif 57957d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 58057d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 581aa372e3fSPaul Mullowney 582aa372e3fSPaul Mullowney /* set the operation */ 583aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 584aa372e3fSPaul Mullowney 585aa372e3fSPaul Mullowney /* set the matrix */ 586aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 587aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 588aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 589aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 590aa372e3fSPaul Mullowney 591aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 592aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 593aa372e3fSPaul Mullowney 594aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 595aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 596aa372e3fSPaul Mullowney 597aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
598aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 599aa372e3fSPaul Mullowney 600afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 601da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 602afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 6031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 604afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 605afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 606afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 607afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 608afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 609afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 610afb2bd1cSJunchao Zhang #endif 611afb2bd1cSJunchao Zhang 612aa372e3fSPaul Mullowney /* perform the solve analysis */ 613aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 614aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 615aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 616d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 618d49cd2b7SBarry Smith upTriFactor->solveInfo, 619d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 620d49cd2b7SBarry Smith #else 621d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 622afb2bd1cSJunchao Zhang #endif 623da79fbbcSStefano Zampini cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 624da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 625aa372e3fSPaul Mullowney 626da79fbbcSStefano Zampini /* assign the pointer */ 627aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6282cbc15d9SMark upTriFactor->AA_h = AAUp; 62957d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 63057d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6314863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 632da79fbbcSStefano Zampini } else { 6332cbc15d9SMark if (!upTriFactor->AA_h) { 6342cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6352cbc15d9SMark } 636da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 637da79fbbcSStefano Zampini offset = nzUpper; 638da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 639da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 640da79fbbcSStefano Zampini 641da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 642da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 643da79fbbcSStefano Zampini 644da79fbbcSStefano Zampini /* decrement the offset */ 645da79fbbcSStefano Zampini offset -= (nz+1); 646da79fbbcSStefano Zampini 647da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6482cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6492cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 650da79fbbcSStefano Zampini } 6512cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 652da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 653da79fbbcSStefano Zampini } 6549ae82921SPaul Mullowney } catch(char *ex) { 6559ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6569ae82921SPaul 
Mullowney } 6579ae82921SPaul Mullowney } 6589ae82921SPaul Mullowney PetscFunctionReturn(0); 6599ae82921SPaul Mullowney } 6609ae82921SPaul Mullowney 661087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6629ae82921SPaul Mullowney { 6639ae82921SPaul Mullowney PetscErrorCode ierr; 6649ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6659ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6669ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6679ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6689ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6699ae82921SPaul Mullowney 6709ae82921SPaul Mullowney PetscFunctionBegin; 671da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 672087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 673087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6742205254eSKarl Rupp 675da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 676aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6779ae82921SPaul Mullowney 678c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 679e057df02SPaul Mullowney /* lower triangular indices */ 6809ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 681da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 682da79fbbcSStefano Zampini const PetscInt *r; 683da79fbbcSStefano Zampini 684da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 685aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 686aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6879ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 688da79fbbcSStefano Zampini ierr 
= PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 689da79fbbcSStefano Zampini } 6909ae82921SPaul Mullowney 691e057df02SPaul Mullowney /* upper triangular indices */ 6929ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 693da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 694da79fbbcSStefano Zampini const PetscInt *c; 695da79fbbcSStefano Zampini 696da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 697aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 698aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6999ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 700da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 701da79fbbcSStefano Zampini } 7029ae82921SPaul Mullowney PetscFunctionReturn(0); 7039ae82921SPaul Mullowney } 7049ae82921SPaul Mullowney 705087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 706087f3262SPaul Mullowney { 707087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 708087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 709aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 710aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 711087f3262SPaul Mullowney cusparseStatus_t stat; 712087f3262SPaul Mullowney PetscErrorCode ierr; 71357d48284SJunchao Zhang cudaError_t cerr; 714087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 715087f3262SPaul Mullowney PetscScalar *AAUp; 716087f3262SPaul Mullowney PetscScalar *AALo; 717087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 718087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 719087f3262SPaul 
Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 720087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 721087f3262SPaul Mullowney 722087f3262SPaul Mullowney PetscFunctionBegin; 723cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 724c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 725087f3262SPaul Mullowney try { 726da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 727da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 728da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 729087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 73057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 73157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 732087f3262SPaul Mullowney 733087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 734087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 735087f3262SPaul Mullowney AiUp[n]=nzUpper; 736087f3262SPaul Mullowney offset = 0; 737087f3262SPaul Mullowney for (i=0; i<n; i++) { 738087f3262SPaul Mullowney /* set the pointers */ 739087f3262SPaul Mullowney v = aa + ai[i]; 740087f3262SPaul Mullowney vj = aj + ai[i]; 741087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 742087f3262SPaul Mullowney 743087f3262SPaul Mullowney /* first, set the diagonal elements */ 744087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 74509f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 746087f3262SPaul Mullowney AiUp[i] = offset; 74709f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 748087f3262SPaul Mullowney 749087f3262SPaul Mullowney offset+=1; 750087f3262SPaul Mullowney if (nz>0) { 751f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 
752580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 753087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 754087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 755087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 756087f3262SPaul Mullowney } 757087f3262SPaul Mullowney offset+=nz; 758087f3262SPaul Mullowney } 759087f3262SPaul Mullowney } 760087f3262SPaul Mullowney 761aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 762da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 763da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 764087f3262SPaul Mullowney 765aa372e3fSPaul Mullowney /* Create the matrix description */ 76657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 76757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7681b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 769afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 770afb2bd1cSJunchao Zhang #else 77157d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 772afb2bd1cSJunchao Zhang #endif 77357d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 77457d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 775087f3262SPaul Mullowney 776aa372e3fSPaul Mullowney /* set the matrix */ 777aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 778aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 779aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 780aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 781aa372e3fSPaul Mullowney 782aa372e3fSPaul Mullowney 
upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 783aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 784aa372e3fSPaul Mullowney 785aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 786aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 787aa372e3fSPaul Mullowney 788aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 789aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 790aa372e3fSPaul Mullowney 791afb2bd1cSJunchao Zhang /* set the operation */ 792afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 793afb2bd1cSJunchao Zhang 794afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 795da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 796afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7971b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 798afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 799afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 800afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 801afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 802afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 803afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 804afb2bd1cSJunchao Zhang #endif 805afb2bd1cSJunchao Zhang 806aa372e3fSPaul Mullowney /* perform the solve analysis */ 807aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 808aa372e3fSPaul Mullowney 
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 809aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 810d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 8111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 812d49cd2b7SBarry Smith upTriFactor->solveInfo, 813d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 814d49cd2b7SBarry Smith #else 815d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 816afb2bd1cSJunchao Zhang #endif 817da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 818da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 819aa372e3fSPaul Mullowney 820da79fbbcSStefano Zampini /* assign the pointer */ 821aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 822aa372e3fSPaul Mullowney 823aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 824da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 825da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 826aa372e3fSPaul Mullowney 827aa372e3fSPaul Mullowney /* Create the matrix description */ 82857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 82957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 831afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 832afb2bd1cSJunchao Zhang #else 83357d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 834afb2bd1cSJunchao Zhang #endif 83557d48284SJunchao Zhang stat = 
cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 83657d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 837aa372e3fSPaul Mullowney 838aa372e3fSPaul Mullowney /* set the operation */ 839aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 840aa372e3fSPaul Mullowney 841aa372e3fSPaul Mullowney /* set the matrix */ 842aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 843aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 844aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 845aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 846aa372e3fSPaul Mullowney 847aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 848aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 849aa372e3fSPaul Mullowney 850aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 851aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 852aa372e3fSPaul Mullowney 853aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 854aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 855aa372e3fSPaul Mullowney 856afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 857da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 858afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8591b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 860afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 861afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 862afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 863afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 864afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 865afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 866afb2bd1cSJunchao Zhang #endif 867afb2bd1cSJunchao Zhang 868aa372e3fSPaul Mullowney /* perform the solve analysis */ 869aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 870aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 871aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 872d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 8731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 874d49cd2b7SBarry Smith loTriFactor->solveInfo, 875d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 876d49cd2b7SBarry Smith #else 877d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 878afb2bd1cSJunchao Zhang #endif 879da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 880da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 881aa372e3fSPaul Mullowney 882da79fbbcSStefano Zampini /* assign the pointer */ 883aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 884087f3262SPaul Mullowney 885da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 88657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 88757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 888da79fbbcSStefano Zampini } else { 889da79fbbcSStefano Zampini /* Fill the upper 
triangular matrix */ 890da79fbbcSStefano Zampini offset = 0; 891da79fbbcSStefano Zampini for (i=0; i<n; i++) { 892da79fbbcSStefano Zampini /* set the pointers */ 893da79fbbcSStefano Zampini v = aa + ai[i]; 894da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 895da79fbbcSStefano Zampini 896da79fbbcSStefano Zampini /* first, set the diagonal elements */ 897da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 898da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 899da79fbbcSStefano Zampini 900da79fbbcSStefano Zampini offset+=1; 901da79fbbcSStefano Zampini if (nz>0) { 902da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 903da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 904da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 905da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 906da79fbbcSStefano Zampini } 907da79fbbcSStefano Zampini offset+=nz; 908da79fbbcSStefano Zampini } 909da79fbbcSStefano Zampini } 910da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 911da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 912da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 913da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 914da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 915da79fbbcSStefano Zampini } 91657d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 91757d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 918087f3262SPaul Mullowney } catch(char *ex) { 919087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 920087f3262SPaul Mullowney } 921087f3262SPaul Mullowney } 922087f3262SPaul Mullowney PetscFunctionReturn(0); 923087f3262SPaul Mullowney } 924087f3262SPaul Mullowney 925087f3262SPaul Mullowney static PetscErrorCode 
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9269ae82921SPaul Mullowney { 9279ae82921SPaul Mullowney PetscErrorCode ierr; 928087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 929087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 930087f3262SPaul Mullowney IS ip = a->row; 931087f3262SPaul Mullowney PetscBool perm_identity; 932087f3262SPaul Mullowney PetscInt n = A->rmap->n; 933087f3262SPaul Mullowney 934087f3262SPaul Mullowney PetscFunctionBegin; 935da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 936087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 937da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 938aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 939aa372e3fSPaul Mullowney 940da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 941da79fbbcSStefano Zampini 942087f3262SPaul Mullowney /* lower triangular indices */ 943087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 944087f3262SPaul Mullowney if (!perm_identity) { 9454e4bbfaaSStefano Zampini IS iip; 946da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9474e4bbfaaSStefano Zampini 9484e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9494e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 950da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 951aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 952aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 953aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9544e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9554e4bbfaaSStefano Zampini ierr = 
ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9564e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 957087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 958da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 959da79fbbcSStefano Zampini } 960087f3262SPaul Mullowney PetscFunctionReturn(0); 961087f3262SPaul Mullowney } 962087f3262SPaul Mullowney 963087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 964087f3262SPaul Mullowney { 965087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 966087f3262SPaul Mullowney IS ip = b->row; 967087f3262SPaul Mullowney PetscBool perm_identity; 968b175d8bbSPaul Mullowney PetscErrorCode ierr; 969087f3262SPaul Mullowney 970087f3262SPaul Mullowney PetscFunctionBegin; 97157181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 972087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 973ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 974087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 975087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 976087f3262SPaul Mullowney if (perm_identity) { 977087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 978087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9794e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9804e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 981087f3262SPaul Mullowney } else { 982087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 983087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9844e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9854e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 986087f3262SPaul Mullowney } 987087f3262SPaul Mullowney 988087f3262SPaul Mullowney /* get the triangular factors */ 989087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 990087f3262SPaul Mullowney PetscFunctionReturn(0); 991087f3262SPaul Mullowney } 9929ae82921SPaul Mullowney 993b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 994bda325fcSPaul Mullowney { 995bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 996aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 997aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 998da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 999da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1000bda325fcSPaul Mullowney cusparseStatus_t stat; 1001aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1002aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1003aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1004aa372e3fSPaul 
Mullowney cusparseDiagType_t diagType; 10051b0a6780SStefano Zampini cudaError_t cerr; 1006da79fbbcSStefano Zampini PetscErrorCode ierr; 1007b175d8bbSPaul Mullowney 1008bda325fcSPaul Mullowney PetscFunctionBegin; 1009aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1010da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1011da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1012aa372e3fSPaul Mullowney 1013aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1014aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1015aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1016aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1017aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1018aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1019aa372e3fSPaul Mullowney 1020aa372e3fSPaul Mullowney /* Create the matrix description */ 102157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 102257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 102357d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 102457d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 102557d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1026aa372e3fSPaul Mullowney 1027aa372e3fSPaul Mullowney /* set the operation */ 1028aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1029aa372e3fSPaul Mullowney 1030aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1031aa372e3fSPaul Mullowney 
loTriFactorT->csrMat = new CsrMatrix; 1032afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1033afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1034aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1035afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1036afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1037afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1038aa372e3fSPaul Mullowney 1039aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1040afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1041afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1042afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1043afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1044afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1045afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1046afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1047afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1048afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1049afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10501b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1051afb2bd1cSJunchao Zhang #endif 1052afb2bd1cSJunchao Zhang 1053da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 
1054aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1055aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1056aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1057aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1058aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1059aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1060afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1061afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1062afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1063d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1064afb2bd1cSJunchao Zhang #else 1065afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1066d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1067afb2bd1cSJunchao Zhang #endif 1068da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1069da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1070aa372e3fSPaul Mullowney 1071afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1072da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1073afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10741b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1075afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077afb2bd1cSJunchao Zhang 
loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1078afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1079afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1080afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1081afb2bd1cSJunchao Zhang #endif 1082afb2bd1cSJunchao Zhang 1083afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1084aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1085afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1086afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1087d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 10881b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1089d49cd2b7SBarry Smith loTriFactorT->solveInfo, 1090d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1091d49cd2b7SBarry Smith #else 1092d49cd2b7SBarry Smith loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1093afb2bd1cSJunchao Zhang #endif 1094da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1095da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1096aa372e3fSPaul Mullowney 1097da79fbbcSStefano Zampini /* assign the pointer */ 1098aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1099aa372e3fSPaul Mullowney 1100aa372e3fSPaul Mullowney /*********************************************/ 1101aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1102aa372e3fSPaul Mullowney /*********************************************/ 1103aa372e3fSPaul Mullowney 1104aa372e3fSPaul Mullowney /* allocate space for the 
transpose of the upper triangular factor */ 1105da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1106da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1107aa372e3fSPaul Mullowney 1108aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1109aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1110aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1111aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1112aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1113aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1114aa372e3fSPaul Mullowney 1115aa372e3fSPaul Mullowney /* Create the matrix description */ 111657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 111757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 111857d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 111957d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 112057d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1121aa372e3fSPaul Mullowney 1122aa372e3fSPaul Mullowney /* set the operation */ 1123aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1124aa372e3fSPaul Mullowney 1125aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1126aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1127afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1128afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1129aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1130afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1131afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1132afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1133aa372e3fSPaul Mullowney 1134aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1135afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1136afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1137afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1138afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1139afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1140afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1141afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1142afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1143afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1144afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1145afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1146afb2bd1cSJunchao Zhang #endif 1147afb2bd1cSJunchao Zhang 1148da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1149aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1150aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1151aa372e3fSPaul Mullowney 
upTriFactor->csrMat->values->data().get(), 1152aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1153aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1154aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1155afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1156afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1157afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1158d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1159afb2bd1cSJunchao Zhang #else 1160afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1161d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1162afb2bd1cSJunchao Zhang #endif 1163d49cd2b7SBarry Smith 1164da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1165da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1166aa372e3fSPaul Mullowney 1167afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1168da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1169afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1171afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1172afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1173afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1174afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1175afb2bd1cSJunchao Zhang 
&upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1176afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1177afb2bd1cSJunchao Zhang #endif 1178afb2bd1cSJunchao Zhang 1179afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1180aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1181afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1182afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1183d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 11841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1185d49cd2b7SBarry Smith upTriFactorT->solveInfo, 1186d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1187d49cd2b7SBarry Smith #else 1188d49cd2b7SBarry Smith upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1189afb2bd1cSJunchao Zhang #endif 1190d49cd2b7SBarry Smith 1191da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1192da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1193aa372e3fSPaul Mullowney 1194da79fbbcSStefano Zampini /* assign the pointer */ 1195aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1196bda325fcSPaul Mullowney PetscFunctionReturn(0); 1197bda325fcSPaul Mullowney } 1198bda325fcSPaul Mullowney 1199a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1200a49f1ed0SStefano Zampini { 1201a49f1ed0SStefano Zampini __host__ __device__ 1202a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1203a49f1ed0SStefano Zampini { 1204a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1205a49f1ed0SStefano Zampini } 1206a49f1ed0SStefano Zampini }; 1207a49f1ed0SStefano Zampini 12083606e59fSJunchao Zhang static 
PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1209bda325fcSPaul Mullowney { 1210aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1211a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1212bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1213bda325fcSPaul Mullowney cusparseStatus_t stat; 1214aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1215b06137fdSPaul Mullowney cudaError_t err; 121685ba7357SStefano Zampini PetscErrorCode ierr; 1217b175d8bbSPaul Mullowney 1218bda325fcSPaul Mullowney PetscFunctionBegin; 1219a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1220a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1221e8d2b73aSMark Adams if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1222a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1223e8d2b73aSMark Adams if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12241a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 122585ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1226ee7b52eaSHong Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1227a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1228a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1229a49f1ed0SStefano Zampini } 1230a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1231aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 123257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1233aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 123457d48284SJunchao Zhang stat = 
cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 123557d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1236aa372e3fSPaul Mullowney 1237b06137fdSPaul Mullowney /* set alpha and beta */ 1238afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12397656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12407656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1241afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12427656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12437656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244b06137fdSPaul Mullowney 1245aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1246aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1247a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1248554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1249554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1250aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1251a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1252aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1253aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1254a3fdcf43SKarl Rupp 1255039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 125681902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1257afb2bd1cSJunchao Zhang 
1258afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 12593606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1260afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1261afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1262afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1263afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1264afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 12663606e59fSJunchao Zhang #else 12673606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12683606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12693606e59fSJunchao Zhang 12703606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12713606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12723606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
12733606e59fSJunchao Zhang */ 12743606e59fSJunchao Zhang if (matrixT->num_entries) { 12753606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 12763606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 12773606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 12783606e59fSJunchao Zhang matrixT->values->data().get(), 12793606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 12803606e59fSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 12813606e59fSJunchao Zhang 12823606e59fSJunchao Zhang } else { 12833606e59fSJunchao Zhang matstructT->matDescr = NULL; 12843606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12853606e59fSJunchao Zhang } 12863606e59fSJunchao Zhang #endif 1287afb2bd1cSJunchao Zhang #endif 1288aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1289afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1290afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1291afb2bd1cSJunchao Zhang #else 1292aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 129351c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 129451c6d536SStefano Zampini /* First convert HYB to CSR */ 1295aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1296aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1297aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1298aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1299aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1300aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1301aa372e3fSPaul Mullowney 1302aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1303aa372e3fSPaul Mullowney matstruct->descr, 
(cusparseHybMat_t)matstruct->mat, 1304aa372e3fSPaul Mullowney temp->values->data().get(), 1305aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 130657d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1307aa372e3fSPaul Mullowney 1308aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1309aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1310aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1311aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1312aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1313aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1314aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1315aa372e3fSPaul Mullowney 1316aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1317aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1318aa372e3fSPaul Mullowney temp->values->data().get(), 1319aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1320aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1321aa372e3fSPaul Mullowney tempT->values->data().get(), 1322aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1323aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 132457d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1325aa372e3fSPaul Mullowney 1326aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1327aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 132857d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1329aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1330aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1331aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1332aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1333aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1334aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 133557d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1336aa372e3fSPaul Mullowney 1337aa372e3fSPaul Mullowney /* assign the pointer */ 1338aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13391a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1340aa372e3fSPaul Mullowney /* delete temporaries */ 1341aa372e3fSPaul Mullowney if (tempT) { 1342aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1343aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1344aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1345aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1346087f3262SPaul Mullowney } 1347aa372e3fSPaul Mullowney if (temp) { 1348aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1349aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1350aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1351aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1352aa372e3fSPaul Mullowney } 1353afb2bd1cSJunchao Zhang #endif 1354aa372e3fSPaul Mullowney } 1355a49f1ed0SStefano Zampini } 1356a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1357a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1358a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1359e8d2b73aSMark Adams if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1360e8d2b73aSMark Adams if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1361e8d2b73aSMark Adams if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1362e8d2b73aSMark Adams if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1363e8d2b73aSMark Adams if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1364e8d2b73aSMark Adams if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1365e8d2b73aSMark Adams if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1366e8d2b73aSMark Adams if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1367a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1368a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1369a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1370a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1371a49f1ed0SStefano Zampini } 1372a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1373a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1374a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1375a49f1ed0SStefano Zampini 1376a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1377a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1378a49f1ed0SStefano Zampini void *csr2cscBuffer; 1379a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1380a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 
1381a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1382a49f1ed0SStefano Zampini matrix->values->data().get(), 1383a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1384a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1385a49f1ed0SStefano Zampini matrixT->values->data().get(), 1386a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1387a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1388a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1389a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1390a49f1ed0SStefano Zampini #endif 1391a49f1ed0SStefano Zampini 13921a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13931a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13941a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13951a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13961a2c6b5cSJunchao Zhang 13971a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13981a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13991a2c6b5cSJunchao Zhang */ 14001a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 14011a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 14021a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 14031a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 14041a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1405a49f1ed0SStefano Zampini matrixT->values->data().get(), 1406a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1407a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1408a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 14091a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1410a49f1ed0SStefano Zampini #else 1411a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14121a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1413a49f1ed0SStefano Zampini #endif 14141a2c6b5cSJunchao Zhang } else { 14151a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14161a2c6b5cSJunchao Zhang } 14171a2c6b5cSJunchao Zhang 1418a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1419a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1420a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1421a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1422a49f1ed0SStefano Zampini #endif 1423a49f1ed0SStefano Zampini } 1424a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1425a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1426a49f1ed0SStefano Zampini matrixT->values->begin())); 1427a49f1ed0SStefano Zampini } 1428ee7b52eaSHong Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 142985ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1430213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1431213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1432aa372e3fSPaul Mullowney /* assign the pointer */ 1433aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14341a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1435bda325fcSPaul Mullowney PetscFunctionReturn(0); 1436bda325fcSPaul Mullowney } 1437bda325fcSPaul Mullowney 1438a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14396fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1440bda325fcSPaul Mullowney { 1441c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1442465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1443465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1444465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1445465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1446bda325fcSPaul Mullowney cusparseStatus_t stat; 1447bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1448aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1449aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1450aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1451b175d8bbSPaul 
Mullowney PetscErrorCode ierr; 1452bda325fcSPaul Mullowney 1453bda325fcSPaul Mullowney PetscFunctionBegin; 1454aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1455aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1456bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1457aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1458aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1459bda325fcSPaul Mullowney } 1460bda325fcSPaul Mullowney 1461bda325fcSPaul Mullowney /* Get the GPU pointers */ 1462c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1463c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1464c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1465c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1466bda325fcSPaul Mullowney 14677a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1468aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1469a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1470c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1471c41cb2e2SAlejandro Lamas Daviña xGPU); 1472aa372e3fSPaul Mullowney 1473aa372e3fSPaul Mullowney /* First, solve U */ 1474aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1475afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1477afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1478afb2bd1cSJunchao Zhang #endif 1479afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1480aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1481aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1482aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1483aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1484d49cd2b7SBarry Smith xarray, 14851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1486d49cd2b7SBarry Smith tempGPU->data().get(), 1487d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1488d49cd2b7SBarry Smith #else 1489d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1490afb2bd1cSJunchao Zhang #endif 1491aa372e3fSPaul Mullowney 1492aa372e3fSPaul Mullowney /* Then, solve L */ 1493aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1494afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14951b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1496afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1497afb2bd1cSJunchao Zhang #endif 1498afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1499aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1500aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1501aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1502aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1503d49cd2b7SBarry Smith tempGPU->data().get(), 15041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1505d49cd2b7SBarry Smith xarray, 1506d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1507d49cd2b7SBarry Smith #else 1508d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1509afb2bd1cSJunchao Zhang #endif 1510aa372e3fSPaul Mullowney 1511aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1512a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1513c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1514aa372e3fSPaul Mullowney tempGPU->begin()); 1515aa372e3fSPaul Mullowney 1516aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1517a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1518bda325fcSPaul Mullowney 1519bda325fcSPaul Mullowney /* restore */ 1520c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1521c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1522661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1523958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1524bda325fcSPaul Mullowney PetscFunctionReturn(0); 1525bda325fcSPaul Mullowney } 1526bda325fcSPaul Mullowney 15276fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1528bda325fcSPaul Mullowney { 1529465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1530465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1531bda325fcSPaul Mullowney cusparseStatus_t stat; 1532bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1533aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1534aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1535aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1536b175d8bbSPaul Mullowney 
PetscErrorCode ierr; 1537bda325fcSPaul Mullowney 1538bda325fcSPaul Mullowney PetscFunctionBegin; 1539aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1540aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1541bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1542aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1543aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1544bda325fcSPaul Mullowney } 1545bda325fcSPaul Mullowney 1546bda325fcSPaul Mullowney /* Get the GPU pointers */ 1547c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1548c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1549bda325fcSPaul Mullowney 15507a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1551aa372e3fSPaul Mullowney /* First, solve U */ 1552aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1553afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1555afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1556afb2bd1cSJunchao Zhang #endif 1557afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1558aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1559aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1560aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1561aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1562d49cd2b7SBarry Smith barray, 15631b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1564d49cd2b7SBarry Smith tempGPU->data().get(), 1565d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1566d49cd2b7SBarry Smith #else 
1567d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1568afb2bd1cSJunchao Zhang #endif 1569aa372e3fSPaul Mullowney 1570aa372e3fSPaul Mullowney /* Then, solve L */ 1571aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1572afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1574afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1575afb2bd1cSJunchao Zhang #endif 1576afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1577aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1578aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1579aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1580aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1581d49cd2b7SBarry Smith tempGPU->data().get(), 15821b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1583d49cd2b7SBarry Smith xarray, 1584d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1585d49cd2b7SBarry Smith #else 1586d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1587afb2bd1cSJunchao Zhang #endif 1588bda325fcSPaul Mullowney 1589bda325fcSPaul Mullowney /* restore */ 1590c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1591c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1592661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1593958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1594bda325fcSPaul Mullowney PetscFunctionReturn(0); 1595bda325fcSPaul Mullowney } 1596bda325fcSPaul Mullowney 15976fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15989ae82921SPaul Mullowney { 1599465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1600465f34aeSAlejandro Lamas Daviña PetscScalar 
*xarray; 1601465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1602465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 16039ae82921SPaul Mullowney cusparseStatus_t stat; 16049ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1605aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1606aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1607aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1608b175d8bbSPaul Mullowney PetscErrorCode ierr; 16099ae82921SPaul Mullowney 16109ae82921SPaul Mullowney PetscFunctionBegin; 1611ebc8f436SDominic Meiser 1612e057df02SPaul Mullowney /* Get the GPU pointers */ 1613c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1614c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1615c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1616c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16179ae82921SPaul Mullowney 16187a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1619aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1620a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1621c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16224e4bbfaaSStefano Zampini tempGPU->begin()); 1623aa372e3fSPaul Mullowney 1624aa372e3fSPaul Mullowney /* Next, solve L */ 1625aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1626afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->num_rows, 16271b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1628afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1629afb2bd1cSJunchao Zhang #endif 1630afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1631aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1632aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1633aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1634aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1635d49cd2b7SBarry Smith tempGPU->data().get(), 16361b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1637d49cd2b7SBarry Smith xarray, 1638d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1639d49cd2b7SBarry Smith #else 1640d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1641afb2bd1cSJunchao Zhang #endif 1642aa372e3fSPaul Mullowney 1643aa372e3fSPaul Mullowney /* Then, solve U */ 1644aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1645afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1647afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1648afb2bd1cSJunchao Zhang #endif 1649afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1650aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1651aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1652aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1653d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 16541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1655d49cd2b7SBarry Smith tempGPU->data().get(), 1656d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1657d49cd2b7SBarry Smith #else 1658d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1659afb2bd1cSJunchao Zhang #endif 
1660d49cd2b7SBarry Smith 16614e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1662a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 16634e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16644e4bbfaaSStefano Zampini xGPU); 16659ae82921SPaul Mullowney 1666c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1667c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1668661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1669958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16709ae82921SPaul Mullowney PetscFunctionReturn(0); 16719ae82921SPaul Mullowney } 16729ae82921SPaul Mullowney 16736fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16749ae82921SPaul Mullowney { 1675465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1676465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16779ae82921SPaul Mullowney cusparseStatus_t stat; 16789ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1679aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1680aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1681aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1682b175d8bbSPaul Mullowney PetscErrorCode ierr; 16839ae82921SPaul Mullowney 16849ae82921SPaul Mullowney PetscFunctionBegin; 1685e057df02SPaul Mullowney /* Get the GPU pointers */ 1686c41cb2e2SAlejandro Lamas Daviña ierr = 
VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1687c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 16889ae82921SPaul Mullowney 16897a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1690aa372e3fSPaul Mullowney /* First, solve L */ 1691aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1692afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1694afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1695afb2bd1cSJunchao Zhang #endif 1696afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1697aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1698aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1699aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1700aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1701d49cd2b7SBarry Smith barray, 17021b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1703d49cd2b7SBarry Smith tempGPU->data().get(), 1704d49cd2b7SBarry Smith loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1705d49cd2b7SBarry Smith #else 1706d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1707afb2bd1cSJunchao Zhang #endif 1708d49cd2b7SBarry Smith 1709aa372e3fSPaul Mullowney /* Next, solve U */ 1710aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1711afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17121b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1713afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1714afb2bd1cSJunchao Zhang #endif 1715afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1716aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1717aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1718aa372e3fSPaul Mullowney 
upTriFactor->csrMat->column_indices->data().get(), 1719aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1720d49cd2b7SBarry Smith tempGPU->data().get(), 17211b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1722d49cd2b7SBarry Smith xarray, 1723d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1724d49cd2b7SBarry Smith #else 1725d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1726afb2bd1cSJunchao Zhang #endif 17279ae82921SPaul Mullowney 1728c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1729c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1730661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1731958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17329ae82921SPaul Mullowney PetscFunctionReturn(0); 17339ae82921SPaul Mullowney } 17349ae82921SPaul Mullowney 17357e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17367e8381f9SStefano Zampini { 17377e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17387e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17397e8381f9SStefano Zampini cudaError_t cerr; 17407e8381f9SStefano Zampini PetscErrorCode ierr; 17417e8381f9SStefano Zampini 17427e8381f9SStefano Zampini PetscFunctionBegin; 17437e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17447e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17457e8381f9SStefano Zampini 17467e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17477e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 17487e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 17497e8381f9SStefano Zampini ierr = 
PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 17507e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17517e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17527e8381f9SStefano Zampini } 17537e8381f9SStefano Zampini PetscFunctionReturn(0); 17547e8381f9SStefano Zampini } 17557e8381f9SStefano Zampini 17567e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17577e8381f9SStefano Zampini { 17587e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17597e8381f9SStefano Zampini PetscErrorCode ierr; 17607e8381f9SStefano Zampini 17617e8381f9SStefano Zampini PetscFunctionBegin; 17627e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 17637e8381f9SStefano Zampini *array = a->a; 17647e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 17657e8381f9SStefano Zampini PetscFunctionReturn(0); 17667e8381f9SStefano Zampini } 17677e8381f9SStefano Zampini 1768042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 17699ae82921SPaul Mullowney { 1770aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 17717c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 17729ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1773213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 17749ae82921SPaul Mullowney PetscErrorCode ierr; 1775aa372e3fSPaul Mullowney cusparseStatus_t stat; 1776abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1777b06137fdSPaul Mullowney cudaError_t err; 17789ae82921SPaul Mullowney 17799ae82921SPaul Mullowney PetscFunctionBegin; 1780e8d2b73aSMark Adams if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1781c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1782a49f1ed0SStefano Zampini if (A->nonzerostate == 
cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1783a49f1ed0SStefano Zampini CsrMatrix *matrix; 1784afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 178585ba7357SStefano Zampini 1786e8d2b73aSMark Adams if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 178785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1788afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 178905035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 17904863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 179185ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1792a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 179334d6c7a5SJose E. Roman } else { 1794abb89eb1SStefano Zampini PetscInt nnz; 179585ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17967c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1797a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 17987c700b8dSJunchao Zhang delete cusparsestruct->workVector; 179981902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1800a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1801a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18029ae82921SPaul Mullowney try { 18039ae82921SPaul Mullowney if (a->compressedrow.use) { 18049ae82921SPaul Mullowney m = a->compressedrow.nrows; 18059ae82921SPaul Mullowney ii = a->compressedrow.i; 18069ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 18079ae82921SPaul Mullowney } else { 1808213423ffSJunchao Zhang m = A->rmap->n; 1809213423ffSJunchao Zhang ii = a->i; 1810e6e9a74fSStefano Zampini ridx = NULL; 18119ae82921SPaul 
Mullowney } 1812e8d2b73aSMark Adams if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1813e8d2b73aSMark Adams if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1814abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1815abb89eb1SStefano Zampini else nnz = a->nz; 18169ae82921SPaul Mullowney 181785ba7357SStefano Zampini /* create cusparse matrix */ 1818abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1819aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 182057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 182157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 182257d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 18239ae82921SPaul Mullowney 1824afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 18257656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 18267656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1827afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 18287656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 18297656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 183057d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1831b06137fdSPaul Mullowney 1832aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 
1833aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1834aa372e3fSPaul Mullowney /* set the matrix */ 1835afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1836afb2bd1cSJunchao Zhang mat->num_rows = m; 1837afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1838abb89eb1SStefano Zampini mat->num_entries = nnz; 1839afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1840afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18419ae82921SPaul Mullowney 1842abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1843abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1844aa372e3fSPaul Mullowney 1845abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1846abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1847aa372e3fSPaul Mullowney 1848aa372e3fSPaul Mullowney /* assign the pointer */ 1849afb2bd1cSJunchao Zhang matstruct->mat = mat; 1850afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1851afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1852afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1853afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1854afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1855afb2bd1cSJunchao Zhang mat->values->data().get(), 1856afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1857afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1858afb2bd1cSJunchao Zhang } 1859afb2bd1cSJunchao Zhang #endif 1860aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1861afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1862afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1863afb2bd1cSJunchao Zhang #else 1864afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1865afb2bd1cSJunchao Zhang mat->num_rows = m; 1866afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1867abb89eb1SStefano Zampini mat->num_entries = nnz; 1868afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1869afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1870aa372e3fSPaul Mullowney 1871abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1872abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1873aa372e3fSPaul Mullowney 1874abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1875abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1876aa372e3fSPaul Mullowney 1877aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 187857d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1879aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1880aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1881afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1882afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1883afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1884afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 188557d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1886aa372e3fSPaul Mullowney /* assign the pointer */ 1887aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1888aa372e3fSPaul Mullowney 1889afb2bd1cSJunchao Zhang if (mat) { 1890afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1891afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1892afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1893afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1894087f3262SPaul Mullowney } 1895afb2bd1cSJunchao Zhang #endif 1896087f3262SPaul Mullowney } 1897ca45077fSPaul Mullowney 1898aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1899213423ffSJunchao Zhang if (a->compressedrow.use) { 1900213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1901aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1902aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1903213423ffSJunchao Zhang tmp = m; 1904213423ffSJunchao Zhang } else { 1905213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1906213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1907213423ffSJunchao Zhang tmp = 0; 1908213423ffSJunchao Zhang } 1909213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1910aa372e3fSPaul Mullowney 1911aa372e3fSPaul Mullowney /* assign the pointer */ 1912aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 19139ae82921SPaul Mullowney } 
catch(char *ex) { 19149ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 19159ae82921SPaul Mullowney } 191605035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 191785ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 191834d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 191934d6c7a5SJose E. Roman } 1920abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 19219ae82921SPaul Mullowney } 19229ae82921SPaul Mullowney PetscFunctionReturn(0); 19239ae82921SPaul Mullowney } 19249ae82921SPaul Mullowney 1925c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1926aa372e3fSPaul Mullowney { 1927aa372e3fSPaul Mullowney template <typename Tuple> 1928aa372e3fSPaul Mullowney __host__ __device__ 1929aa372e3fSPaul Mullowney void operator()(Tuple t) 1930aa372e3fSPaul Mullowney { 1931aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1932aa372e3fSPaul Mullowney } 1933aa372e3fSPaul Mullowney }; 1934aa372e3fSPaul Mullowney 19357e8381f9SStefano Zampini struct VecCUDAEquals 19367e8381f9SStefano Zampini { 19377e8381f9SStefano Zampini template <typename Tuple> 19387e8381f9SStefano Zampini __host__ __device__ 19397e8381f9SStefano Zampini void operator()(Tuple t) 19407e8381f9SStefano Zampini { 19417e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19427e8381f9SStefano Zampini } 19437e8381f9SStefano Zampini }; 19447e8381f9SStefano Zampini 1945e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1946e6e9a74fSStefano Zampini { 1947e6e9a74fSStefano Zampini template <typename Tuple> 1948e6e9a74fSStefano Zampini __host__ __device__ 1949e6e9a74fSStefano Zampini void operator()(Tuple t) 1950e6e9a74fSStefano Zampini { 1951e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1952e6e9a74fSStefano Zampini } 1953e6e9a74fSStefano Zampini }; 1954e6e9a74fSStefano Zampini 1955afb2bd1cSJunchao Zhang struct MatMatCusparse { 
1956ccdfe979SStefano Zampini PetscBool cisdense; 1957ccdfe979SStefano Zampini PetscScalar *Bt; 1958ccdfe979SStefano Zampini Mat X; 1959fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1960fcdce8c4SStefano Zampini PetscLogDouble flops; 1961fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 1962b4285af6SJunchao Zhang 1963afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1964fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 1965afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1966afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1967afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1968afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1969b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1970b4285af6SJunchao Zhang void *dBuffer4; 1971b4285af6SJunchao Zhang void *dBuffer5; 1972b4285af6SJunchao Zhang #endif 1973fcdce8c4SStefano Zampini size_t mmBufferSize; 1974fcdce8c4SStefano Zampini void *mmBuffer; 1975fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1976fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 1977afb2bd1cSJunchao Zhang #endif 1978afb2bd1cSJunchao Zhang }; 1979ccdfe979SStefano Zampini 1980ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1981ccdfe979SStefano Zampini { 1982ccdfe979SStefano Zampini PetscErrorCode ierr; 1983ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1984ccdfe979SStefano Zampini cudaError_t cerr; 1985fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1986fcdce8c4SStefano Zampini cusparseStatus_t stat; 1987fcdce8c4SStefano Zampini #endif 1988ccdfe979SStefano Zampini 1989ccdfe979SStefano Zampini PetscFunctionBegin; 1990ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1991fcdce8c4SStefano Zampini delete 
mmdata->Bcsr; 1992afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1993fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1994afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1995afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1996fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1997b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1998b4285af6SJunchao Zhang if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 1999b4285af6SJunchao Zhang if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2000b4285af6SJunchao Zhang #endif 2001b4285af6SJunchao Zhang if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2002b4285af6SJunchao Zhang if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2003afb2bd1cSJunchao Zhang #endif 2004ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2005ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 2006ccdfe979SStefano Zampini PetscFunctionReturn(0); 2007ccdfe979SStefano Zampini } 2008ccdfe979SStefano Zampini 2009ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2010ccdfe979SStefano Zampini 2011ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2012ccdfe979SStefano Zampini { 2013ccdfe979SStefano Zampini Mat_Product *product = C->product; 2014ccdfe979SStefano Zampini Mat A,B; 2015afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2016ccdfe979SStefano Zampini PetscBool flg,biscuda; 2017ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2018ccdfe979SStefano Zampini 
cusparseStatus_t stat; 2019ccdfe979SStefano Zampini cusparseOperation_t opA; 2020ccdfe979SStefano Zampini const PetscScalar *barray; 2021ccdfe979SStefano Zampini PetscScalar *carray; 2022ccdfe979SStefano Zampini PetscErrorCode ierr; 2023ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2024ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2025ccdfe979SStefano Zampini CsrMatrix *csrmat; 2026ccdfe979SStefano Zampini 2027ccdfe979SStefano Zampini PetscFunctionBegin; 2028ccdfe979SStefano Zampini MatCheckProduct(C,1); 2029e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2030ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2031ccdfe979SStefano Zampini A = product->A; 2032ccdfe979SStefano Zampini B = product->B; 2033ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2034e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2035ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2036ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 2037ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2038ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2039ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2040ccdfe979SStefano Zampini switch (product->type) { 2041ccdfe979SStefano Zampini case MATPRODUCT_AB: 2042ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2043ccdfe979SStefano Zampini mat = cusp->mat; 2044ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2045ccdfe979SStefano Zampini m = A->rmap->n; 2046ccdfe979SStefano Zampini n = B->cmap->n; 2047ccdfe979SStefano 
Zampini break; 2048ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20491a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2050e6e9a74fSStefano Zampini mat = cusp->mat; 2051e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2052e6e9a74fSStefano Zampini } else { 20533606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2054ccdfe979SStefano Zampini mat = cusp->matTranspose; 2055ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2056e6e9a74fSStefano Zampini } 2057ccdfe979SStefano Zampini m = A->cmap->n; 2058ccdfe979SStefano Zampini n = B->cmap->n; 2059ccdfe979SStefano Zampini break; 2060ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2061ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2062ccdfe979SStefano Zampini mat = cusp->mat; 2063ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2064ccdfe979SStefano Zampini m = A->rmap->n; 2065ccdfe979SStefano Zampini n = B->rmap->n; 2066ccdfe979SStefano Zampini break; 2067ccdfe979SStefano Zampini default: 2068e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2069ccdfe979SStefano Zampini } 2070e8d2b73aSMark Adams if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2071ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2072ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2073ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2074afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2075ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2076afb2bd1cSJunchao Zhang 2077ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2078c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == 
MATPRODUCT_PtAP) { 2079c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2080c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2081c8378d12SStefano Zampini } else { 2082c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2083c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2084c8378d12SStefano Zampini } 2085c8378d12SStefano Zampini 2086c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2087afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2088afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2089a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2090afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2091fcdce8c4SStefano Zampini size_t mmBufferSize; 2092afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2093afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2094afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2095afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2096afb2bd1cSJunchao Zhang } 2097c8378d12SStefano Zampini 2098afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2099afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2100afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2101afb2bd1cSJunchao Zhang 
mmdata->Clda = clda; 2102afb2bd1cSJunchao Zhang } 2103afb2bd1cSJunchao Zhang 2104afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2105afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2106afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2107afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2108afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2109afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2110afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2111afb2bd1cSJunchao Zhang } 2112afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2113afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2114afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2115fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2116fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2117ee7b52eaSHong Zhang cudaError_t cerr; 2118fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2119fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2120fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2121fcdce8c4SStefano Zampini } 2122afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2123afb2bd1cSJunchao Zhang } else { 2124afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2125afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2126afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2127afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2128afb2bd1cSJunchao Zhang } 
2129afb2bd1cSJunchao Zhang 2130afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2131afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2132afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2133afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2134fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2135afb2bd1cSJunchao Zhang #else 2136afb2bd1cSJunchao Zhang PetscInt k; 2137afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2138ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2139ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2140ccdfe979SStefano Zampini cublasStatus_t cerr; 2141ccdfe979SStefano Zampini 2142ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2143ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2144ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2145ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2146ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2147ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2148ccdfe979SStefano Zampini blda = B->cmap->n; 2149afb2bd1cSJunchao Zhang k = B->cmap->n; 2150afb2bd1cSJunchao Zhang } else { 2151afb2bd1cSJunchao Zhang k = B->rmap->n; 2152ccdfe979SStefano Zampini } 2153ccdfe979SStefano Zampini 2154afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2155ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2156afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2157ccdfe979SStefano Zampini csrmat->values->data().get(), 2158ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2159ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2160ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2161ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2162afb2bd1cSJunchao Zhang #endif 2163c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2164c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2165ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2166ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2167ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2168ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2169ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2170ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2171ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2172ccdfe979SStefano Zampini } else { 2173ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2174ccdfe979SStefano Zampini } 2175ccdfe979SStefano Zampini if (mmdata->cisdense) { 2176ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2177ccdfe979SStefano Zampini } 2178ccdfe979SStefano Zampini if (!biscuda) { 2179ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2180ccdfe979SStefano Zampini } 2181ccdfe979SStefano Zampini PetscFunctionReturn(0); 2182ccdfe979SStefano Zampini } 2183ccdfe979SStefano Zampini 2184ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2185ccdfe979SStefano Zampini { 2186ccdfe979SStefano Zampini Mat_Product *product = C->product; 2187ccdfe979SStefano Zampini Mat A,B; 2188ccdfe979SStefano Zampini PetscInt m,n; 2189ccdfe979SStefano Zampini PetscBool cisdense,flg; 
2190ccdfe979SStefano Zampini PetscErrorCode ierr; 2191ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2192ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2193ccdfe979SStefano Zampini 2194ccdfe979SStefano Zampini PetscFunctionBegin; 2195ccdfe979SStefano Zampini MatCheckProduct(C,1); 2196e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2197ccdfe979SStefano Zampini A = product->A; 2198ccdfe979SStefano Zampini B = product->B; 2199ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2200e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2201ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2202e8d2b73aSMark Adams if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2203ccdfe979SStefano Zampini switch (product->type) { 2204ccdfe979SStefano Zampini case MATPRODUCT_AB: 2205ccdfe979SStefano Zampini m = A->rmap->n; 2206ccdfe979SStefano Zampini n = B->cmap->n; 2207ccdfe979SStefano Zampini break; 2208ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2209ccdfe979SStefano Zampini m = A->cmap->n; 2210ccdfe979SStefano Zampini n = B->cmap->n; 2211ccdfe979SStefano Zampini break; 2212ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2213ccdfe979SStefano Zampini m = A->rmap->n; 2214ccdfe979SStefano Zampini n = B->rmap->n; 2215ccdfe979SStefano Zampini break; 2216ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2217ccdfe979SStefano Zampini m = B->cmap->n; 2218ccdfe979SStefano Zampini n = B->cmap->n; 2219ccdfe979SStefano Zampini break; 2220ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2221ccdfe979SStefano Zampini m = B->rmap->n; 2222ccdfe979SStefano Zampini n = B->rmap->n; 2223ccdfe979SStefano Zampini break; 2224ccdfe979SStefano Zampini default: 2225e8d2b73aSMark Adams 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2226ccdfe979SStefano Zampini } 2227ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2228ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2229ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2230ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2231ccdfe979SStefano Zampini 2232ccdfe979SStefano Zampini /* product data */ 2233ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2234ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2235afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2236afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2237ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2238afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2239ccdfe979SStefano Zampini } 2240afb2bd1cSJunchao Zhang #endif 2241ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2242ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2243ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2244ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2245ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2246ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2247ccdfe979SStefano Zampini } else { 2248ccdfe979SStefano Zampini ierr 
= MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2249ccdfe979SStefano Zampini } 2250ccdfe979SStefano Zampini } 2251ccdfe979SStefano Zampini C->product->data = mmdata; 2252ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2253ccdfe979SStefano Zampini 2254ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2255ccdfe979SStefano Zampini PetscFunctionReturn(0); 2256ccdfe979SStefano Zampini } 2257ccdfe979SStefano Zampini 2258fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2259ccdfe979SStefano Zampini { 2260ccdfe979SStefano Zampini Mat_Product *product = C->product; 2261fcdce8c4SStefano Zampini Mat A,B; 2262fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2263fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2264fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2265fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2266fcdce8c4SStefano Zampini PetscBool flg; 2267ccdfe979SStefano Zampini PetscErrorCode ierr; 2268fcdce8c4SStefano Zampini cusparseStatus_t stat; 2269fcdce8c4SStefano Zampini cudaError_t cerr; 2270fcdce8c4SStefano Zampini MatProductType ptype; 2271fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2272fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2273fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2274fcdce8c4SStefano Zampini #endif 2275b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2276ccdfe979SStefano Zampini 2277ccdfe979SStefano Zampini PetscFunctionBegin; 2278ccdfe979SStefano Zampini MatCheckProduct(C,1); 2279e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2280fcdce8c4SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2281e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2282fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2283fcdce8c4SStefano Zampini A = product->A; 2284fcdce8c4SStefano Zampini B = product->B; 2285fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2286fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2287fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2288e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2289fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2290e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2291fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2292e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2293fcdce8c4SStefano Zampini goto finalize; 2294fcdce8c4SStefano Zampini } 2295fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2296fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2297e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2298fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2299e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2300fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind 
to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2301fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2302fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2303fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2304fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2305e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2306e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2307e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2308fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2309fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2310fcdce8c4SStefano Zampini 2311fcdce8c4SStefano Zampini ptype = product->type; 2312fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2313fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2314fcdce8c4SStefano Zampini switch (ptype) { 2315fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2316fcdce8c4SStefano Zampini Amat = Acusp->mat; 2317fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2318fcdce8c4SStefano Zampini break; 2319fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2320fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2321fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2322fcdce8c4SStefano Zampini break; 2323fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2324fcdce8c4SStefano Zampini Amat = Acusp->mat; 2325fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2326fcdce8c4SStefano Zampini 
break; 2327fcdce8c4SStefano Zampini default: 2328e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2329fcdce8c4SStefano Zampini } 2330fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2331e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2332e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2333e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2334fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2335fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2336fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2337e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2338e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2339e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2340fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2341fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2342fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2343b4285af6SJunchao Zhang stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2344b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2345b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2346b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2347b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2348b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2349b4285af6SJunchao Zhang #else 2350b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2351fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2352fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2353fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2354b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2355fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2356fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2357b4285af6SJunchao Zhang #endif 2358fcdce8c4SStefano Zampini #else 2359b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2360fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2361fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2362fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2363fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), 
Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2364fcdce8c4SStefano Zampini #endif 2365fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2366fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2367fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2368fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2369fcdce8c4SStefano Zampini finalize: 2370fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 2371fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2372fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2373fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2374fcdce8c4SStefano Zampini c->reallocs = 0; 2375fcdce8c4SStefano Zampini C->info.mallocs += 0; 2376fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2377fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2378fcdce8c4SStefano Zampini C->num_ass++; 2379ccdfe979SStefano Zampini PetscFunctionReturn(0); 2380ccdfe979SStefano Zampini } 2381fcdce8c4SStefano Zampini 2382fcdce8c4SStefano Zampini /* MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse products A*B, A^T*B and A*B^T for two MATSEQAIJCUSPARSE operands. Input/Output: C, whose C->product supplies A, B and the product type. The routine allocates the MatMatCusparse product data, sizes C and turns it into a MATSEQAIJCUSPARSE matrix, builds C's CSR structure on the GPU with the cuSPARSE SpGEMM routines selected by the CUDA version, copies the row offsets and column indices back into the host SeqAIJ arrays, and installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE as the numeric routine. Raises PETSC_ERR_GPU when product data already exists, when A or B is not MATSEQAIJCUSPARSE or not stored as MAT_CUSPARSE_CSR, when the product type is not AB/AtB/ABt, or when a required mult/CSR struct is missing. */ static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2383fcdce8c4SStefano Zampini { 2384fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2385fcdce8c4SStefano Zampini Mat A,B; 2386fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2387fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2388fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2389fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2390fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2391fcdce8c4SStefano Zampini PetscBool flg; 2392fcdce8c4SStefano Zampini PetscErrorCode ierr; 2393fcdce8c4SStefano Zampini cusparseStatus_t stat; 2394fcdce8c4SStefano Zampini cudaError_t cerr; 
2395fcdce8c4SStefano Zampini MatProductType ptype; 2396fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2397fcdce8c4SStefano Zampini PetscLogDouble flops; 2398fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2399fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2400fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2401fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2402fcdce8c4SStefano Zampini #else 2403fcdce8c4SStefano Zampini int cnz; 2404fcdce8c4SStefano Zampini #endif 2405b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2406fcdce8c4SStefano Zampini 2407fcdce8c4SStefano Zampini PetscFunctionBegin; 2408fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2409e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2410fcdce8c4SStefano Zampini A = product->A; 2411fcdce8c4SStefano Zampini B = product->B; 2412fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2413e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2414fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2415e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2416fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2417fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2418fcdce8c4SStefano Zampini /* product data */ 2419fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2420fcdce8c4SStefano Zampini C->product->data = mmdata; 2421fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2422fcdce8c4SStefano Zampini 2423fcdce8c4SStefano Zampini ierr = 
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2424fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2425*d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2426*d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2427*d60bce21SJunchao Zhang if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2428*d60bce21SJunchao Zhang if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2429*d60bce21SJunchao Zhang 2430fcdce8c4SStefano Zampini /* when A (resp. B) is flagged symmetric, A^T*B (resp. A*B^T) is handled as a plain A*B, so the explicit-transpose branches below are not taken */ ptype = product->type; 2431fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2432fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2433fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2434fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2435fcdce8c4SStefano Zampini /* choose the result sizes m x n (inner dimension k) and the GPU operand structs; ciscompressed/biscompressed record whether C inherits A's compressed rows and whether B's compressed rows must be expanded */ switch (ptype) { 2436fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2437fcdce8c4SStefano Zampini m = A->rmap->n; 2438fcdce8c4SStefano Zampini n = B->cmap->n; 2439fcdce8c4SStefano Zampini k = A->cmap->n; 2440fcdce8c4SStefano Zampini Amat = Acusp->mat; 2441fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2442fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2443fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2444fcdce8c4SStefano Zampini break; 2445fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2446fcdce8c4SStefano Zampini m = A->cmap->n; 2447fcdce8c4SStefano Zampini n = B->cmap->n; 2448fcdce8c4SStefano Zampini k = A->rmap->n; 24493606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2450fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2451fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2452fcdce8c4SStefano Zampini if (b->compressedrow.use) 
biscompressed = PETSC_TRUE; 2453fcdce8c4SStefano Zampini break; 2454fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2455fcdce8c4SStefano Zampini m = A->rmap->n; 2456fcdce8c4SStefano Zampini n = B->rmap->n; 2457fcdce8c4SStefano Zampini k = A->cmap->n; 24583606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2459fcdce8c4SStefano Zampini Amat = Acusp->mat; 2460fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2461fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2462fcdce8c4SStefano Zampini break; 2463fcdce8c4SStefano Zampini default: 2464e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2465fcdce8c4SStefano Zampini } 2466fcdce8c4SStefano Zampini 2467fcdce8c4SStefano Zampini /* create cusparse matrix */ 2468fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2469fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2470fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2471fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2472fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2473fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2474fcdce8c4SStefano Zampini 2475fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2476fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2477fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2478fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2479fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2480fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2481fcdce8c4SStefano Zampini Cmat->cprowIndices 
= new THRUSTINTARRAY(c->compressedrow.nrows); 2482fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2483fcdce8c4SStefano Zampini } else { 2484fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2485fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2486fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2487fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2488fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2489fcdce8c4SStefano Zampini } 2490fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2491fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2492fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2493fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2494fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2495fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2496fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2497fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2498fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2499fcdce8c4SStefano Zampini /* device-side copies of PETSC_CUSPARSE_ONE / PETSC_CUSPARSE_ZERO; the SpGEMM calls below run with CUSPARSE_POINTER_MODE_DEVICE and take alpha_one/beta_zero as arguments */ cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2500fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2501fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2502fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2503fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2504fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2505fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2506fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2507fcdce8c4SStefano Zampini c->nz = 0; 2508fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2509fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2510fcdce8c4SStefano Zampini goto finalizesym; 2511fcdce8c4SStefano Zampini } 2512fcdce8c4SStefano Zampini 2513e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2514e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2515fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2516fcdce8c4SStefano Zampini if (!biscompressed) { 2517fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2518fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2519fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2520fcdce8c4SStefano Zampini #endif 2521fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2522fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2523fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2524fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2525fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2526fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2527fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2528fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; /* column indices and values are shared with B's existing CSR struct; only the row offsets differ (taken from Bcusp->rowoffsets_gpu below) */ 2529fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2530fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 
2531fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2532fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2533fcdce8c4SStefano Zampini } 2534fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2535fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2536fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2537fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2538fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2539fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2540fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2541fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2542fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2543fcdce8c4SStefano Zampini } 2544fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2545fcdce8c4SStefano Zampini #endif 2546fcdce8c4SStefano Zampini } 2547e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2548e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2549fcdce8c4SStefano Zampini /* precompute flops count */ 2550fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2551fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2552fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2553fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2554fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2555fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2556fcdce8c4SStefano Zampini /* every stored entry of A in column brow adds twice the number of stored entries in row brow of B */ flops += 2.*(b->i[brow+1] - b->i[brow]); 2557fcdce8c4SStefano Zampini } 2558fcdce8c4SStefano Zampini } 2559fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2560fcdce8c4SStefano Zampini for (i=0, flops = 0; 
i<A->rmap->n; i++) { 2561fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2562fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2563fcdce8c4SStefano Zampini /* row i of A paired with row i of B: 2 * nnz(A row i) * nnz(B row i) */ flops += (2.*anzi)*bnzi; 2564fcdce8c4SStefano Zampini } 2565fcdce8c4SStefano Zampini } else { /* TODO */ 2566fcdce8c4SStefano Zampini flops = 0.; /* no flop estimate is computed for the remaining product type (A*B^T); zero is recorded */ 2567fcdce8c4SStefano Zampini } 2568fcdce8c4SStefano Zampini 2569fcdce8c4SStefano Zampini mmdata->flops = flops; 2570fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2571b4285af6SJunchao Zhang 2572fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2573fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2574fcdce8c4SStefano Zampini /* C's sparse descriptor is created with zero nonzeros and NULL arrays; cusparseCsrSetPointers attaches the real arrays once the nonzero count is known */ stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2575fcdce8c4SStefano Zampini NULL, NULL, NULL, 2576fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2577fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2578fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2579b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2580b4285af6SJunchao Zhang { 2581b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2582b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2583b4285af6SJunchao Zhang */ 2584b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2585b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2586b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2587b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2588b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2589b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2590b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2591b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2592b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2593b4285af6SJunchao Zhang 2594b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2595b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2596b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2597b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2598b4285af6SJunchao Zhang &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2599b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2600b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2601b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2602b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2603b4285af6SJunchao Zhang &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2604b4285af6SJunchao Zhang 2605b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2606b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2607b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, 2608b4285af6SJunchao Zhang &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2609b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2610b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2611b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2612b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2613b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2614b4285af6SJunchao Zhang &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2615b4285af6SJunchao Zhang cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2616b4285af6SJunchao Zhang cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); /* dBuffer1 and dBuffer2 are released here; dBuffer3 stays allocated until after the cusparseSpGEMMreuse_copy call further down */ 2617b4285af6SJunchao Zhang 2618b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2619b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 2620b4285af6SJunchao Zhang stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2621b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2622b4285af6SJunchao Zhang /* allocate matrix C */ 2623b4285af6SJunchao Zhang Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2624b4285af6SJunchao Zhang Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2625b4285af6SJunchao Zhang /* update matC with the new pointers */ 2626b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2627b4285af6SJunchao Zhang Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2628b4285af6SJunchao Zhang 2629b4285af6SJunchao Zhang 
/*----------------------------------------------------------------------*/ 2630b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2631b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2632b4285af6SJunchao Zhang &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2633b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2634b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2635b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2636b4285af6SJunchao Zhang &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2637b4285af6SJunchao Zhang cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2638b4285af6SJunchao Zhang /* the symbolic phase also runs the compute step once, so C's device values are filled here as well */ stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2639b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2640b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2641b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2642b4285af6SJunchao Zhang ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2643b4285af6SJunchao Zhang } 2644b4285af6SJunchao Zhang #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2645b4285af6SJunchao Zhang size_t bufSize2; 2646fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2647b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2648fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2649fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2650fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2651bfcc3627SStefano Zampini cerr = 
cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); /* CUDA 11.0 to 11.3 path: cusparseSpGEMM work-estimation / compute / copy sequence, with both work buffers kept in mmdata */ 2652fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2653b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2654fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2655fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2656fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2657fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2658b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2659fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2660fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2661fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2662fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2663fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2664fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2665fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2666fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2667bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2668fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2669b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2670fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2671fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2672fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2673fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2674fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2675fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 267600702c57SStefano Zampini ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2677fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2678fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2679fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2680fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2681fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2682fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2683b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2684fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2685fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2686b4285af6SJunchao Zhang #endif 
2687fcdce8c4SStefano Zampini #else 2688fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2689b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2690fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2691fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2692fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2693fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2694fcdce8c4SStefano Zampini c->nz = cnz; /* pre-CUDA-11 path: the nonzero count comes back through the host variable cnz, which is why the pointer mode is switched to HOST around cusparseXcsrgemmNnz */ 2695fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2696fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2697fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2698fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2699fcdce8c4SStefano Zampini 2700fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2701fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2702fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2703fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2704b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2705fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2706fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2707fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2708fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2709fcdce8c4SStefano Zampini #endif 2710fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2711fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2712fcdce8c4SStefano Zampini finalizesym: /* reached both after the SpGEMM and directly from the empty-product shortcut: allocate the host SeqAIJ arrays (i, j, ilen, imax, a) and copy the structure from the GPU CSR arrays */ 2713fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2714fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2715fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2716fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2717fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2718fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2719fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2720fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2721fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2722fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2723fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2724fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2725fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2726fcdce8c4SStefano Zampini cerr = 
cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2727fcdce8c4SStefano Zampini } else { 2728fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2729fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2730fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2731fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2732fcdce8c4SStefano Zampini } 2733fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2734fcdce8c4SStefano Zampini PetscInt r = 0; 2735fcdce8c4SStefano Zampini c->i[0] = 0; 2736fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2737fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2738fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2739fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2740fcdce8c4SStefano Zampini } 2741fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2742fcdce8c4SStefano Zampini } 2743fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2744fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2745fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2746fcdce8c4SStefano Zampini c->maxnz = c->nz; 2747fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2748fcdce8c4SStefano Zampini c->rmax = 0; 2749fcdce8c4SStefano Zampini /* per-row entry counts from the host row offsets, plus the number of non-empty rows and the longest row */ for (k = 0; k < m; k++) { 2750fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2751fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2752fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 
2753fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2754fcdce8c4SStefano Zampini } 2755fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2756fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2757fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2758fcdce8c4SStefano Zampini 2759fcdce8c4SStefano Zampini C->nonzerostate++; 2760fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2761fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2762fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2763fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2764fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2765fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2766fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2767abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2768fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2769fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2770fcdce8c4SStefano Zampini } 2771fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2772fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2773fcdce8c4SStefano Zampini } 2774fcdce8c4SStefano Zampini 2775fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2776fcdce8c4SStefano Zampini 2777fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2778fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2779fcdce8c4SStefano Zampini { 2780fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2781fcdce8c4SStefano Zampini PetscErrorCode ierr; 2782fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 
2783fcdce8c4SStefano Zampini 2784fcdce8c4SStefano Zampini PetscFunctionBegin; 2785fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2786fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2787abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 2788fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2789fcdce8c4SStefano Zampini } 2790fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2791fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2792fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2793fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2794fcdce8c4SStefano Zampini } 2795fcdce8c4SStefano Zampini } 279665e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 279765e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 279865e4b4d4SStefano Zampini switch (product->type) { 279965e4b4d4SStefano Zampini case MATPRODUCT_AB: 280065e4b4d4SStefano Zampini if (product->api_user) { 280165e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 280265e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 280365e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 280465e4b4d4SStefano Zampini } else { 280565e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 280665e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 280765e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 280865e4b4d4SStefano Zampini } 
280965e4b4d4SStefano Zampini break; 281065e4b4d4SStefano Zampini case MATPRODUCT_AtB: 281165e4b4d4SStefano Zampini if (product->api_user) { 281265e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 281365e4b4d4SStefano Zampini ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 281465e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 281565e4b4d4SStefano Zampini } else { 281665e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 281765e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 281865e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 281965e4b4d4SStefano Zampini } 282065e4b4d4SStefano Zampini break; 282165e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 282265e4b4d4SStefano Zampini if (product->api_user) { 282365e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 282465e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 282565e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 282665e4b4d4SStefano Zampini } else { 282765e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 282865e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 282965e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 283065e4b4d4SStefano Zampini } 283165e4b4d4SStefano Zampini break; 283265e4b4d4SStefano Zampini case MATPRODUCT_RARt: 
283365e4b4d4SStefano Zampini if (product->api_user) { 283465e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 283565e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 283665e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 283765e4b4d4SStefano Zampini } else { 283865e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 283965e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 284065e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 284165e4b4d4SStefano Zampini } 284265e4b4d4SStefano Zampini break; 284365e4b4d4SStefano Zampini case MATPRODUCT_ABC: 284465e4b4d4SStefano Zampini if (product->api_user) { 284565e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 284665e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 284765e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 284865e4b4d4SStefano Zampini } else { 284965e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 285065e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 285165e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 285265e4b4d4SStefano Zampini } 285365e4b4d4SStefano Zampini break; 285465e4b4d4SStefano Zampini default: 285565e4b4d4SStefano Zampini break; 285665e4b4d4SStefano Zampini } 285765e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = 
PETSC_FALSE; 285865e4b4d4SStefano Zampini } 285965e4b4d4SStefano Zampini /* dispatch */ 2860fcdce8c4SStefano Zampini if (isdense) { 2861ccdfe979SStefano Zampini switch (product->type) { 2862ccdfe979SStefano Zampini case MATPRODUCT_AB: 2863ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2864ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2865ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2866ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2867fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2868fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2869fcdce8c4SStefano Zampini } else { 2870fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2871fcdce8c4SStefano Zampini } 2872fcdce8c4SStefano Zampini break; 2873fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2874fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2875fcdce8c4SStefano Zampini break; 2876ccdfe979SStefano Zampini default: 2877ccdfe979SStefano Zampini break; 2878ccdfe979SStefano Zampini } 2879fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2880fcdce8c4SStefano Zampini switch (product->type) { 2881fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2882fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2883fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2884fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2885fcdce8c4SStefano Zampini break; 2886fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2887fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2888fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2889fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2890fcdce8c4SStefano Zampini break; 2891fcdce8c4SStefano Zampini default: 2892fcdce8c4SStefano Zampini break; 2893fcdce8c4SStefano Zampini } 2894fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 2895fcdce8c4SStefano Zampini ierr = 
MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2896fcdce8c4SStefano Zampini } 2897ccdfe979SStefano Zampini PetscFunctionReturn(0); 2898ccdfe979SStefano Zampini } 2899ccdfe979SStefano Zampini 29006fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 29019ae82921SPaul Mullowney { 2902b175d8bbSPaul Mullowney PetscErrorCode ierr; 29039ae82921SPaul Mullowney 29049ae82921SPaul Mullowney PetscFunctionBegin; 2905e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2906e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2907e6e9a74fSStefano Zampini } 2908e6e9a74fSStefano Zampini 2909e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2910e6e9a74fSStefano Zampini { 2911e6e9a74fSStefano Zampini PetscErrorCode ierr; 2912e6e9a74fSStefano Zampini 2913e6e9a74fSStefano Zampini PetscFunctionBegin; 2914e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2915e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2916e6e9a74fSStefano Zampini } 2917e6e9a74fSStefano Zampini 2918e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2919e6e9a74fSStefano Zampini { 2920e6e9a74fSStefano Zampini PetscErrorCode ierr; 2921e6e9a74fSStefano Zampini 2922e6e9a74fSStefano Zampini PetscFunctionBegin; 2923e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2924e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2925e6e9a74fSStefano Zampini } 2926e6e9a74fSStefano Zampini 2927e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2928e6e9a74fSStefano Zampini { 2929e6e9a74fSStefano Zampini PetscErrorCode ierr; 2930e6e9a74fSStefano Zampini 2931e6e9a74fSStefano Zampini PetscFunctionBegin; 2932e6e9a74fSStefano Zampini ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 29339ae82921SPaul Mullowney PetscFunctionReturn(0); 29349ae82921SPaul Mullowney } 29359ae82921SPaul Mullowney 29366fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2937ca45077fSPaul Mullowney { 2938b175d8bbSPaul Mullowney PetscErrorCode ierr; 2939ca45077fSPaul Mullowney 2940ca45077fSPaul Mullowney PetscFunctionBegin; 2941e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2942ca45077fSPaul Mullowney PetscFunctionReturn(0); 2943ca45077fSPaul Mullowney } 2944ca45077fSPaul Mullowney 2945a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2946a0e72f99SJunchao Zhang { 2947a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2948a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2949a0e72f99SJunchao Zhang } 2950a0e72f99SJunchao Zhang 2951afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2952e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 29539ae82921SPaul Mullowney { 29549ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2955aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 29569ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2957e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2958b175d8bbSPaul Mullowney PetscErrorCode ierr; 2959aa372e3fSPaul Mullowney cusparseStatus_t stat; 2960e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2961e6e9a74fSStefano Zampini PetscBool compressed; 2962afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2963afb2bd1cSJunchao Zhang PetscInt nx,ny; 2964afb2bd1cSJunchao Zhang #endif 29656e111a19SKarl Rupp 29669ae82921SPaul Mullowney PetscFunctionBegin; 2967e8d2b73aSMark Adams if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2968e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2969afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2970d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2971e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2972e6e9a74fSStefano Zampini } 297334d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 297434d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2975e6e9a74fSStefano Zampini if (!trans) { 29769ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2977e8d2b73aSMark Adams if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2978e6e9a74fSStefano Zampini } else { 29791a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 2980e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2981e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2982e6e9a74fSStefano Zampini } else { 29833606e59fSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 2984e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2985e6e9a74fSStefano Zampini } 2986e6e9a74fSStefano Zampini } 2987e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2988e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2989213423ffSJunchao Zhang 2990e6e9a74fSStefano Zampini try { 2991e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2992213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2993213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2994afb2bd1cSJunchao Zhang 299585ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2996e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2997afb2bd1cSJunchao Zhang /* z = A x + beta y. 2998afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
2999afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3000afb2bd1cSJunchao Zhang */ 3001e6e9a74fSStefano Zampini xptr = xarray; 3002afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3003213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3004afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3005afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3006afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3007afb2bd1cSJunchao Zhang */ 3008afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3009afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3010afb2bd1cSJunchao Zhang nx = mat->num_cols; 3011afb2bd1cSJunchao Zhang ny = mat->num_rows; 3012afb2bd1cSJunchao Zhang } 3013afb2bd1cSJunchao Zhang #endif 3014e6e9a74fSStefano Zampini } else { 3015afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3016afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3017afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3018afb2bd1cSJunchao Zhang */ 3019afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3020e6e9a74fSStefano Zampini dptr = zarray; 3021e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3022afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3023e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3024a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3025e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3026e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3027e6e9a74fSStefano Zampini } 3028afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3029afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3030afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3031afb2bd1cSJunchao Zhang nx = mat->num_rows; 3032afb2bd1cSJunchao Zhang ny = mat->num_cols; 3033afb2bd1cSJunchao Zhang } 3034afb2bd1cSJunchao Zhang #endif 3035e6e9a74fSStefano Zampini } 30369ae82921SPaul Mullowney 3037afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3038aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3039afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3040afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3041afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3042ee7b52eaSHong Zhang cudaError_t cerr; 3043afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3044afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3045afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3046afb2bd1cSJunchao Zhang matstruct->matDescr, 3047afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3048afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3049afb2bd1cSJunchao Zhang cusparse_scalartype, 3050afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3051afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3052afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3053afb2bd1cSJunchao Zhang 3054afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3055afb2bd1cSJunchao Zhang } else { 3056afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3057afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3058afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3059afb2bd1cSJunchao Zhang } 3060afb2bd1cSJunchao Zhang 3061afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 3062afb2bd1cSJunchao Zhang matstruct->alpha_one, 30633606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3064afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3065afb2bd1cSJunchao Zhang beta, 3066afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3067afb2bd1cSJunchao Zhang cusparse_scalartype, 3068afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3069afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3070afb2bd1cSJunchao Zhang #else 30717656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 
3072e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3073a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3074afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3075aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3076e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 307757d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3078afb2bd1cSJunchao Zhang #endif 3079aa372e3fSPaul Mullowney } else { 3080213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3081afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3082afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3083afb2bd1cSJunchao Zhang #else 3084301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3085e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3086afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3087e6e9a74fSStefano Zampini xptr, beta, 308857d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3089afb2bd1cSJunchao Zhang #endif 3090a65300a6SPaul Mullowney } 3091aa372e3fSPaul Mullowney } 3092958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3093aa372e3fSPaul Mullowney 3094e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3095213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3096213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3097213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3098e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3099213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 31007656d835SStefano Zampini } 3101213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3102c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 31037656d835SStefano Zampini } 31047656d835SStefano Zampini 3105213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3106213423ffSJunchao Zhang if (compressed) { 3107e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3108a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3109a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3110a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
3111a0e72f99SJunchao Zhang */ 3112a0e72f99SJunchao Zhang #if 0 3113a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3114a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3115a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3116e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3117c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3118a0e72f99SJunchao Zhang #else 3119a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3120a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3121a0e72f99SJunchao Zhang #endif 3122958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3123e6e9a74fSStefano Zampini } 3124e6e9a74fSStefano Zampini } else { 3125e6e9a74fSStefano Zampini if (yy && yy != zz) { 3126e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3127e6e9a74fSStefano Zampini } 3128e6e9a74fSStefano Zampini } 3129e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3130213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3131213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 31329ae82921SPaul Mullowney } catch(char *ex) { 31339ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 31349ae82921SPaul Mullowney } 3135e6e9a74fSStefano Zampini if (yy) { 3136958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3137e6e9a74fSStefano 
Zampini } else { 3138e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3139e6e9a74fSStefano Zampini } 31409ae82921SPaul Mullowney PetscFunctionReturn(0); 31419ae82921SPaul Mullowney } 31429ae82921SPaul Mullowney 31436fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3144ca45077fSPaul Mullowney { 3145b175d8bbSPaul Mullowney PetscErrorCode ierr; 31466e111a19SKarl Rupp 3147ca45077fSPaul Mullowney PetscFunctionBegin; 3148e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3149ca45077fSPaul Mullowney PetscFunctionReturn(0); 3150ca45077fSPaul Mullowney } 3151ca45077fSPaul Mullowney 31526fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 31539ae82921SPaul Mullowney { 31549ae82921SPaul Mullowney PetscErrorCode ierr; 3155042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3156042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 31573fa6b06aSMark Adams 3158042217e8SBarry Smith PetscFunctionBegin; 3159042217e8SBarry Smith ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3160042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3161042217e8SBarry Smith cudaError_t cerr; 3162042217e8SBarry Smith 3163042217e8SBarry Smith ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3164042217e8SBarry Smith cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3165042217e8SBarry Smith cusp->deviceMat = NULL; 3166042217e8SBarry Smith } 31679ae82921SPaul Mullowney PetscFunctionReturn(0); 31689ae82921SPaul Mullowney } 31699ae82921SPaul Mullowney 31709ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3171e057df02SPaul Mullowney /*@ 31729ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 
3173e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately be pushed down 3174e057df02SPaul Mullowney to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix 3175e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3176e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3177e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 31789ae82921SPaul Mullowney 3179d083f849SBarry Smith Collective 31809ae82921SPaul Mullowney 31819ae82921SPaul Mullowney Input Parameters: 31829ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 31839ae82921SPaul Mullowney . m - number of rows 31849ae82921SPaul Mullowney . n - number of columns 31859ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 31869ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 31870298fd71SBarry Smith (possibly different for each row) or NULL 31889ae82921SPaul Mullowney 31899ae82921SPaul Mullowney Output Parameter: 31909ae82921SPaul Mullowney . A - the matrix 31919ae82921SPaul Mullowney 31929ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 31939ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 31949ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 31959ae82921SPaul Mullowney 31969ae82921SPaul Mullowney Notes: 31979ae82921SPaul Mullowney If nnz is given then nz is ignored 31989ae82921SPaul Mullowney 31999ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 32009ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 32019ae82921SPaul Mullowney storage.
That is, the stored row and column indices can begin at 32029ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 32039ae82921SPaul Mullowney 32049ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 32050298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 32069ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 32079ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 32089ae82921SPaul Mullowney 32099ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 32109ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 32119ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 32129ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 32139ae82921SPaul Mullowney 32149ae82921SPaul Mullowney Level: intermediate 32159ae82921SPaul Mullowney 3216e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 32179ae82921SPaul Mullowney @*/ 32189ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 32199ae82921SPaul Mullowney { 32209ae82921SPaul Mullowney PetscErrorCode ierr; 32219ae82921SPaul Mullowney 32229ae82921SPaul Mullowney PetscFunctionBegin; 32239ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 32249ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 32259ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 32269ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 32279ae82921SPaul Mullowney PetscFunctionReturn(0); 
32289ae82921SPaul Mullowney } 32299ae82921SPaul Mullowney 32306fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 32319ae82921SPaul Mullowney { 32329ae82921SPaul Mullowney PetscErrorCode ierr; 3233ab25e6cbSDominic Meiser 32349ae82921SPaul Mullowney PetscFunctionBegin; 32359ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 3236470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 32379ae82921SPaul Mullowney } else { 3238470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3239aa372e3fSPaul Mullowney } 3240c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3241ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3242ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3243ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3244fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3245ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 32467e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 32477e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3248ae48a8d0SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 32499ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 32509ae82921SPaul Mullowney 
PetscFunctionReturn(0); 32519ae82921SPaul Mullowney } 32529ae82921SPaul Mullowney 3253ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 325495639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 32559ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 32569ff858a8SKarl Rupp { 32579ff858a8SKarl Rupp PetscErrorCode ierr; 32589ff858a8SKarl Rupp 32599ff858a8SKarl Rupp PetscFunctionBegin; 32609ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3261ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 32629ff858a8SKarl Rupp PetscFunctionReturn(0); 32639ff858a8SKarl Rupp } 32649ff858a8SKarl Rupp 3265039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 326695639643SRichard Tran Mills { 3267e6e9a74fSStefano Zampini PetscErrorCode ierr; 3268a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3269039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3270039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3271039c6fbaSStefano Zampini PetscScalar *ay; 3272039c6fbaSStefano Zampini const PetscScalar *ax; 3273039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3274e6e9a74fSStefano Zampini 327595639643SRichard Tran Mills PetscFunctionBegin; 3276a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3277a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3278039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3279a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3280a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3281a587d139SMark PetscFunctionReturn(0); 328295639643SRichard Tran Mills } 3283039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU 
*/ 3284a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3285a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3286e8d2b73aSMark Adams if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3287e8d2b73aSMark Adams if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3288039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3289039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3290039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3291039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3292039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3293039c6fbaSStefano Zampini if (eq) { 3294039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3295039c6fbaSStefano Zampini } 3296039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3297039c6fbaSStefano Zampini } 3298d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3299d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3300039c6fbaSStefano Zampini 3301039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3302039c6fbaSStefano Zampini cusparseStatus_t stat; 3303039c6fbaSStefano Zampini PetscScalar b = 1.0; 3304039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3305039c6fbaSStefano Zampini size_t bufferSize; 3306039c6fbaSStefano Zampini void *buffer; 3307ee7b52eaSHong Zhang cudaError_t cerr; 3308039c6fbaSStefano Zampini #endif 3309039c6fbaSStefano Zampini 3310039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3311039c6fbaSStefano 
Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3312039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3313039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3314039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3315039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3316039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3317039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3318039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3319039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3320039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3321039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3322039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3323039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3324039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3325039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3326039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3327039c6fbaSStefano Zampini #else 3328039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3329039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3330039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3331039c6fbaSStefano Zampini 
&b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3332039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3333039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3334039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3335039c6fbaSStefano Zampini #endif 3336039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3337039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3338039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3339039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3340039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3341a587d139SMark cublasHandle_t cublasv2handle; 3342039c6fbaSStefano Zampini cublasStatus_t berr; 3343a587d139SMark PetscBLASInt one = 1, bnz = 1; 3344039c6fbaSStefano Zampini 3345039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3346039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3347a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3348a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3349a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3350039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3351a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3352a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3353039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3354039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3355a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3356039c6fbaSStefano Zampini } else { 3357a49f1ed0SStefano Zampini ierr = 
MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3358d2be01edSStefano Zampini ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3359a587d139SMark } 336095639643SRichard Tran Mills PetscFunctionReturn(0); 336195639643SRichard Tran Mills } 336295639643SRichard Tran Mills 336333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 336433c9ba73SStefano Zampini { 336533c9ba73SStefano Zampini PetscErrorCode ierr; 336633c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 336733c9ba73SStefano Zampini PetscScalar *ay; 336833c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 336933c9ba73SStefano Zampini cublasStatus_t berr; 337033c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 337133c9ba73SStefano Zampini 337233c9ba73SStefano Zampini PetscFunctionBegin; 337333c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 337433c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 337533c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 337633c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 337733c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 337833c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 337933c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 338033c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 338133c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 338233c9ba73SStefano Zampini PetscFunctionReturn(0); 338333c9ba73SStefano Zampini } 338433c9ba73SStefano Zampini 33853fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 33863fa6b06aSMark Adams { 33873fa6b06aSMark Adams PetscErrorCode ierr; 33887e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3389a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 33907e8381f9SStefano Zampini 33913fa6b06aSMark Adams 
PetscFunctionBegin; 33923fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 33933fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 33947e8381f9SStefano Zampini if (spptr->mat) { 33957e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 33967e8381f9SStefano Zampini if (matrix->values) { 33977e8381f9SStefano Zampini both = PETSC_TRUE; 33987e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 33997e8381f9SStefano Zampini } 34007e8381f9SStefano Zampini } 34017e8381f9SStefano Zampini if (spptr->matTranspose) { 34027e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 34037e8381f9SStefano Zampini if (matrix->values) { 34047e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34057e8381f9SStefano Zampini } 34067e8381f9SStefano Zampini } 34073fa6b06aSMark Adams } 3408a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3409a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3410a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 34117e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3412a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 34133fa6b06aSMark Adams PetscFunctionReturn(0); 34143fa6b06aSMark Adams } 34153fa6b06aSMark Adams 3416a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3417a587d139SMark { 3418a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3419a587d139SMark PetscErrorCode ierr; 3420a587d139SMark 3421a587d139SMark PetscFunctionBegin; 3422a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3423a587d139SMark if (flg) { 3424a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3425a587d139SMark 342633c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3427a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3428a587d139SMark 
A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3429a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3430a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3431a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3432a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3433a587d139SMark A->ops->multhermitiantranspose = NULL; 3434a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3435fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3436c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3437a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3438a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3439a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3440a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3441a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3442fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3443a587d139SMark } else { 344433c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3445a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3446a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3447a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3448a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3449a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3450a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3451a587d139SMark A->ops->multhermitiantranspose = 
MatMultHermitianTranspose_SeqAIJCUSPARSE; 3452a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3453fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3454c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3455a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3456a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3457a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3458a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3459a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3460fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3461a587d139SMark } 3462a587d139SMark A->boundtocpu = flg; 3463a587d139SMark a->inode.use = flg; 3464a587d139SMark PetscFunctionReturn(0); 3465a587d139SMark } 3466a587d139SMark 346749735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 34689ae82921SPaul Mullowney { 34699ae82921SPaul Mullowney PetscErrorCode ierr; 3470aa372e3fSPaul Mullowney cusparseStatus_t stat; 347149735bf3SStefano Zampini Mat B; 34729ae82921SPaul Mullowney 34739ae82921SPaul Mullowney PetscFunctionBegin; 3474832b2c02SStefano Zampini ierr = 
PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 347549735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 347649735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 347749735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 347849735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 347949735bf3SStefano Zampini } 348049735bf3SStefano Zampini B = *newmat; 348149735bf3SStefano Zampini 348234136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 348334136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 348434136279SStefano Zampini 348549735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 34869ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3487e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3488e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3489e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3490a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 34911a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3492d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3493a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3494a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3495a435da06SStefano Zampini #else 3496d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3497a435da06SStefano Zampini #endif 3498d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3499d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3500d8132acaSStefano Zampini #endif 35011a2c6b5cSJunchao Zhang B->spptr = spptr; 35029ae82921SPaul Mullowney } else { 
3503e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3504e6e9a74fSStefano Zampini 3505e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3506e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3507a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3508e6e9a74fSStefano Zampini B->spptr = spptr; 35099ae82921SPaul Mullowney } 3510e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 351149735bf3SStefano Zampini } 3512693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 35139ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 35141a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 35159ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 351695639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3517693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 35182205254eSKarl Rupp 3519e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 35209ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3521bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3522ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 3523ae48a8d0SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3524ae48a8d0SStefano Zampini #endif 35259ae82921SPaul Mullowney PetscFunctionReturn(0); 35269ae82921SPaul Mullowney } 35279ae82921SPaul Mullowney 352802fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 352902fe1965SBarry Smith { 353002fe1965SBarry Smith PetscErrorCode ierr; 353102fe1965SBarry Smith 353202fe1965SBarry Smith PetscFunctionBegin; 
353302fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 35340ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 353502fe1965SBarry Smith PetscFunctionReturn(0); 353602fe1965SBarry Smith } 353702fe1965SBarry Smith 35383ca39a21SBarry Smith /*MC 3539e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3540e057df02SPaul Mullowney 3541e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 35422692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 35432692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3544e057df02SPaul Mullowney 3545e057df02SPaul Mullowney Options Database Keys: 3546e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3547aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3548a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3549e057df02SPaul Mullowney 3550e057df02SPaul Mullowney Level: beginner 3551e057df02SPaul Mullowney 35528468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3553e057df02SPaul Mullowney M*/ 35547f756511SDominic Meiser 3555bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 35560f39cd5aSBarry Smith 35573ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 355842c9c57cSBarry Smith { 355942c9c57cSBarry Smith PetscErrorCode ierr; 356042c9c57cSBarry Smith 356142c9c57cSBarry Smith PetscFunctionBegin; 3562bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 35633ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 35643ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 35653ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 35663ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3567bddcd29dSMark Adams 356842c9c57cSBarry Smith PetscFunctionReturn(0); 356942c9c57cSBarry Smith } 357029b38603SBarry Smith 3571470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 35727f756511SDominic Meiser { 3573e6e9a74fSStefano Zampini PetscErrorCode ierr; 35747f756511SDominic Meiser cusparseStatus_t stat; 35757f756511SDominic Meiser 35767f756511SDominic Meiser PetscFunctionBegin; 35777f756511SDominic Meiser if 
(*cusparsestruct) { 3578e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3579e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 35807f756511SDominic Meiser delete (*cusparsestruct)->workVector; 358181902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 35827e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 35837e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3584a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 35857e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3586e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 35877f756511SDominic Meiser } 35887f756511SDominic Meiser PetscFunctionReturn(0); 35897f756511SDominic Meiser } 35907f756511SDominic Meiser 35917f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 35927f756511SDominic Meiser { 35937f756511SDominic Meiser PetscFunctionBegin; 35947f756511SDominic Meiser if (*mat) { 35957f756511SDominic Meiser delete (*mat)->values; 35967f756511SDominic Meiser delete (*mat)->column_indices; 35977f756511SDominic Meiser delete (*mat)->row_offsets; 35987f756511SDominic Meiser delete *mat; 35997f756511SDominic Meiser *mat = 0; 36007f756511SDominic Meiser } 36017f756511SDominic Meiser PetscFunctionReturn(0); 36027f756511SDominic Meiser } 36037f756511SDominic Meiser 3604470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 36057f756511SDominic Meiser { 36067f756511SDominic Meiser cusparseStatus_t stat; 36077f756511SDominic Meiser PetscErrorCode ierr; 36087f756511SDominic Meiser 36097f756511SDominic Meiser PetscFunctionBegin; 36107f756511SDominic Meiser if (*trifactor) { 361157d48284SJunchao Zhang if 
((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3612afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 36137f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 36141b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 36152cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3616afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 36171b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3618afb2bd1cSJunchao Zhang #endif 3619da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 36207f756511SDominic Meiser } 36217f756511SDominic Meiser PetscFunctionReturn(0); 36227f756511SDominic Meiser } 36237f756511SDominic Meiser 3624470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 36257f756511SDominic Meiser { 36267f756511SDominic Meiser CsrMatrix *mat; 36277f756511SDominic Meiser cusparseStatus_t stat; 36287f756511SDominic Meiser cudaError_t err; 36297f756511SDominic Meiser 36307f756511SDominic Meiser PetscFunctionBegin; 36317f756511SDominic Meiser if (*matstruct) { 36327f756511SDominic Meiser if ((*matstruct)->mat) { 36337f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3634afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3635afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3636afb2bd1cSJunchao Zhang #else 36377f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 363857d48284SJunchao Zhang stat = 
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3639afb2bd1cSJunchao Zhang #endif 36407f756511SDominic Meiser } else { 36417f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 36427f756511SDominic Meiser CsrMatrix_Destroy(&mat); 36437f756511SDominic Meiser } 36447f756511SDominic Meiser } 364557d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 36467f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3647afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 36487656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 36497656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3650afb2bd1cSJunchao Zhang 3651afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3652afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3653afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3654afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3655afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 3656afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3657afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3658afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3659afb2bd1cSJunchao Zhang } 3660afb2bd1cSJunchao Zhang } 3661afb2bd1cSJunchao Zhang #endif 36627f756511SDominic Meiser delete *matstruct; 36637e8381f9SStefano Zampini *matstruct = NULL; 36647f756511SDominic Meiser } 36657f756511SDominic Meiser PetscFunctionReturn(0); 36667f756511SDominic Meiser } 36677f756511SDominic Meiser 3668e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 
36697f756511SDominic Meiser { 3670e6e9a74fSStefano Zampini PetscErrorCode ierr; 3671e6e9a74fSStefano Zampini 36727f756511SDominic Meiser PetscFunctionBegin; 36737f756511SDominic Meiser if (*trifactors) { 3674e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3675e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3676e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3677e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 36787f756511SDominic Meiser delete (*trifactors)->rpermIndices; 36797f756511SDominic Meiser delete (*trifactors)->cpermIndices; 36807f756511SDominic Meiser delete (*trifactors)->workVector; 36817e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 36827e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 36837e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 3684bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3685bddcd29dSMark Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3686e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3687ccdfe979SStefano Zampini } 3688ccdfe979SStefano Zampini PetscFunctionReturn(0); 3689ccdfe979SStefano Zampini } 3690ccdfe979SStefano Zampini 3691ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3692ccdfe979SStefano Zampini { 3693e6e9a74fSStefano Zampini PetscErrorCode ierr; 3694ccdfe979SStefano Zampini cusparseHandle_t handle; 3695ccdfe979SStefano Zampini cusparseStatus_t stat; 3696ccdfe979SStefano Zampini 3697ccdfe979SStefano Zampini PetscFunctionBegin; 3698ccdfe979SStefano Zampini if (*trifactors) { 
3699e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 37007f756511SDominic Meiser if (handle = (*trifactors)->handle) { 370157d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 37027f756511SDominic Meiser } 3703e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 37047f756511SDominic Meiser } 37057f756511SDominic Meiser PetscFunctionReturn(0); 37067f756511SDominic Meiser } 37077e8381f9SStefano Zampini 37087e8381f9SStefano Zampini struct IJCompare 37097e8381f9SStefano Zampini { 37107e8381f9SStefano Zampini __host__ __device__ 37117e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37127e8381f9SStefano Zampini { 37137e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 37147e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 37157e8381f9SStefano Zampini return false; 37167e8381f9SStefano Zampini } 37177e8381f9SStefano Zampini }; 37187e8381f9SStefano Zampini 37197e8381f9SStefano Zampini struct IJEqual 37207e8381f9SStefano Zampini { 37217e8381f9SStefano Zampini __host__ __device__ 37227e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37237e8381f9SStefano Zampini { 37247e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 37257e8381f9SStefano Zampini return true; 37267e8381f9SStefano Zampini } 37277e8381f9SStefano Zampini }; 37287e8381f9SStefano Zampini 37297e8381f9SStefano Zampini struct IJDiff 37307e8381f9SStefano Zampini { 37317e8381f9SStefano Zampini __host__ __device__ 37327e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37337e8381f9SStefano Zampini { 37347e8381f9SStefano Zampini return t1 == t2 ? 
0 : 1; 37357e8381f9SStefano Zampini } 37367e8381f9SStefano Zampini }; 37377e8381f9SStefano Zampini 37387e8381f9SStefano Zampini struct IJSum 37397e8381f9SStefano Zampini { 37407e8381f9SStefano Zampini __host__ __device__ 37417e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37427e8381f9SStefano Zampini { 37437e8381f9SStefano Zampini return t1||t2; 37447e8381f9SStefano Zampini } 37457e8381f9SStefano Zampini }; 37467e8381f9SStefano Zampini 37477e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3748e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 37497e8381f9SStefano Zampini { 37507e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3751fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3752bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 375308391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 37547e8381f9SStefano Zampini CsrMatrix *matrix; 37557e8381f9SStefano Zampini PetscErrorCode ierr; 37567e8381f9SStefano Zampini PetscInt n; 37577e8381f9SStefano Zampini 37587e8381f9SStefano Zampini PetscFunctionBegin; 37597e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 37607e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 37617e8381f9SStefano Zampini if (!cusp->cooPerm) { 37627e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 37637e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 37647e8381f9SStefano Zampini PetscFunctionReturn(0); 37657e8381f9SStefano Zampini } 37667e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 37677e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3768e61fc153SStefano Zampini if (!v) { 3769e61fc153SStefano Zampini if 
(imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3770e61fc153SStefano Zampini goto finalize; 37717e8381f9SStefano Zampini } 3772e61fc153SStefano Zampini n = cusp->cooPerm->size(); 377308391a17SStefano Zampini if (isCudaMem(v)) { 377408391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 377508391a17SStefano Zampini } else { 3776e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3777e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 377808391a17SStefano Zampini d_v = cooPerm_v->data(); 3779e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 378008391a17SStefano Zampini } 3781bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3782e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3783ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3784bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 378508391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3786ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3787ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3788ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
3789ddea5d60SJunchao Zhang */ 3790e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3791e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3792e61fc153SStefano Zampini delete cooPerm_w; 37937e8381f9SStefano Zampini } else { 3794ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 379508391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 37967e8381f9SStefano Zampini matrix->values->begin())); 379708391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 37987e8381f9SStefano Zampini matrix->values->end())); 3799ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 38007e8381f9SStefano Zampini } 38017e8381f9SStefano Zampini } else { 3802e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 380308391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3804e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 38057e8381f9SStefano Zampini } else { 380608391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38077e8381f9SStefano Zampini matrix->values->begin())); 380808391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 
38097e8381f9SStefano Zampini matrix->values->end())); 38107e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 38117e8381f9SStefano Zampini } 38127e8381f9SStefano Zampini } 3813bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3814e61fc153SStefano Zampini finalize: 3815e61fc153SStefano Zampini delete cooPerm_v; 38167e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3817e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3818fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3819fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3820fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3821fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3822fcdce8c4SStefano Zampini a->reallocs = 0; 3823fcdce8c4SStefano Zampini A->info.mallocs += 0; 3824fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3825fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3826fcdce8c4SStefano Zampini A->num_ass++; 38277e8381f9SStefano Zampini PetscFunctionReturn(0); 38287e8381f9SStefano Zampini } 38297e8381f9SStefano Zampini 3830a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3831a49f1ed0SStefano Zampini { 3832a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3833a49f1ed0SStefano Zampini PetscErrorCode ierr; 3834a49f1ed0SStefano Zampini 3835a49f1ed0SStefano Zampini PetscFunctionBegin; 3836a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3837a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3838a49f1ed0SStefano Zampini if (destroy) { 3839a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 
3840a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3841a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3842a49f1ed0SStefano Zampini } 38431a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3844a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3845a49f1ed0SStefano Zampini } 3846a49f1ed0SStefano Zampini 38477e8381f9SStefano Zampini #include <thrust/binary_search.h> 3848e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 38497e8381f9SStefano Zampini { 38507e8381f9SStefano Zampini PetscErrorCode ierr; 38517e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 38527e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 38537e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 38547e8381f9SStefano Zampini cudaError_t cerr; 38557e8381f9SStefano Zampini 38567e8381f9SStefano Zampini PetscFunctionBegin; 38577e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 38587e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 38597e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 38607e8381f9SStefano Zampini if (n != cooPerm_n) { 38617e8381f9SStefano Zampini delete cusp->cooPerm; 38627e8381f9SStefano Zampini delete cusp->cooPerm_a; 38637e8381f9SStefano Zampini cusp->cooPerm = NULL; 38647e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 38657e8381f9SStefano Zampini } 38667e8381f9SStefano Zampini if (n) { 38677e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 38687e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 38697e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 38707e8381f9SStefano Zampini 38717e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 38727e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 38737e8381f9SStefano Zampini 38747e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 38757e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 38767e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 3877ddea5d60SJunchao Zhang 3878ddea5d60SJunchao Zhang /* Ex. 
3879ddea5d60SJunchao Zhang n = 6 3880ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 3881ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 3882ddea5d60SJunchao Zhang */ 38837e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 38847e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 38857e8381f9SStefano Zampini 388608391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 38877e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3888ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3889ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 38907e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 38917e8381f9SStefano Zampini 3892ddea5d60SJunchao Zhang /* 3893ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 3894ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 3895ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 3896ddea5d60SJunchao Zhang */ 3897ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3898ddea5d60SJunchao Zhang 3899ddea5d60SJunchao Zhang /* 3900ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 3901ddea5d60SJunchao Zhang ^ekey 3902ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 3903ddea5d60SJunchao Zhang ^nekye 3904ddea5d60SJunchao Zhang */ 39057e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 39067e8381f9SStefano Zampini delete cusp->cooPerm_a; 39077e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 3908ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3909ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3910ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 3911ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3912ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 39137e8381f9SStefano Zampini w[0] = 0; 3914ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3915ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 39167e8381f9SStefano Zampini } 39177e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 3918ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3919ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3920ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 392108391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 39227e8381f9SStefano Zampini 39237e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 39247e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 39257e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 39267e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 39277e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3928ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 39297e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 39307e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3931fcdce8c4SStefano Zampini a->rmax = 0; 39327e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 39337e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 39347e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 39357e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 39367e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 39377e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 39387e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 39397e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 39407e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3941fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 39427e8381f9SStefano Zampini } 3943fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 39447e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 39457e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3946fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 39477e8381f9SStefano Zampini } else { 39487e8381f9SStefano 
Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 39497e8381f9SStefano Zampini } 3950e61fc153SStefano Zampini ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 39517e8381f9SStefano Zampini 39527e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3953e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 3954e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 39557e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 39567e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 39577e8381f9SStefano Zampini A->nonzerostate++; 39587e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3959a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 39607e8381f9SStefano Zampini 39617e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 39627e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 39637e8381f9SStefano Zampini PetscFunctionReturn(0); 39647e8381f9SStefano Zampini } 3965ed502f03SStefano Zampini 39665b7e41feSStefano Zampini /*@C 39675b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
39685b7e41feSStefano Zampini 39695b7e41feSStefano Zampini Not collective 39705b7e41feSStefano Zampini 39715b7e41feSStefano Zampini Input Parameters: 39725b7e41feSStefano Zampini + A - the matrix 39735b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 39745b7e41feSStefano Zampini 39755b7e41feSStefano Zampini Output Parameters: 39765b7e41feSStefano Zampini + ia - the CSR row pointers 39775b7e41feSStefano Zampini - ja - the CSR column indices 39785b7e41feSStefano Zampini 39795b7e41feSStefano Zampini Level: developer 39805b7e41feSStefano Zampini 39815b7e41feSStefano Zampini Notes: 39825b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 39835b7e41feSStefano Zampini 39845b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 39855b7e41feSStefano Zampini @*/ 39865f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 39875f101d05SStefano Zampini { 39885f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 39895f101d05SStefano Zampini CsrMatrix *csr; 39905f101d05SStefano Zampini PetscErrorCode ierr; 39915f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 39925f101d05SStefano Zampini 39935f101d05SStefano Zampini PetscFunctionBegin; 39945f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 39955f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 39965f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 39975f101d05SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 39985f101d05SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 39995f101d05SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 40005f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 40015f101d05SStefano Zampini if (i) { 40025f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 40035f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 40045f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 40055f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 40065f101d05SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 40075f101d05SStefano Zampini } 40085f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 40095f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 40105f101d05SStefano Zampini } 40115f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 40125f101d05SStefano Zampini PetscFunctionReturn(0); 40135f101d05SStefano Zampini } 40145f101d05SStefano Zampini 40155b7e41feSStefano Zampini /*@C 40165b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 40175b7e41feSStefano Zampini 40185b7e41feSStefano Zampini Not collective 40195b7e41feSStefano Zampini 40205b7e41feSStefano Zampini Input Parameters: 40215b7e41feSStefano Zampini + A - the matrix 40225b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40235b7e41feSStefano Zampini 40245b7e41feSStefano Zampini Output Parameters: 40255b7e41feSStefano Zampini + ia - the CSR row pointers 40265b7e41feSStefano Zampini - ja - the CSR column indices 40275b7e41feSStefano Zampini 40285b7e41feSStefano Zampini Level: developer 40295b7e41feSStefano Zampini 40305b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetIJ() 40315b7e41feSStefano Zampini @*/ 40325f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, 
const int **j) 40335f101d05SStefano Zampini { 40345f101d05SStefano Zampini PetscFunctionBegin; 40355f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 40365f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 40375f101d05SStefano Zampini if (i) *i = NULL; 40385f101d05SStefano Zampini if (j) *j = NULL; 40395f101d05SStefano Zampini PetscFunctionReturn(0); 40405f101d05SStefano Zampini } 40415f101d05SStefano Zampini 40425b7e41feSStefano Zampini /*@C 40435b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 40445b7e41feSStefano Zampini 40455b7e41feSStefano Zampini Not Collective 40465b7e41feSStefano Zampini 40475b7e41feSStefano Zampini Input Parameter: 40485b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 40495b7e41feSStefano Zampini 40505b7e41feSStefano Zampini Output Parameter: 40515b7e41feSStefano Zampini . a - pointer to the device data 40525b7e41feSStefano Zampini 40535b7e41feSStefano Zampini Level: developer 40545b7e41feSStefano Zampini 40555b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 40565b7e41feSStefano Zampini 40575b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 40585b7e41feSStefano Zampini @*/ 4059ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4060ed502f03SStefano Zampini { 4061ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4062ed502f03SStefano Zampini CsrMatrix *csr; 4063ed502f03SStefano Zampini PetscErrorCode ierr; 4064ed502f03SStefano Zampini 4065ed502f03SStefano Zampini PetscFunctionBegin; 4066ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4067ed502f03SStefano Zampini PetscValidPointer(a,2); 4068ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 
4069ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4070ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 407133c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4072ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4073ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4074ed502f03SStefano Zampini *a = csr->values->data().get(); 4075ed502f03SStefano Zampini PetscFunctionReturn(0); 4076ed502f03SStefano Zampini } 4077ed502f03SStefano Zampini 40785b7e41feSStefano Zampini /*@C 40795b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 40805b7e41feSStefano Zampini 40815b7e41feSStefano Zampini Not Collective 40825b7e41feSStefano Zampini 40835b7e41feSStefano Zampini Input Parameter: 40845b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 40855b7e41feSStefano Zampini 40865b7e41feSStefano Zampini Output Parameter: 40875b7e41feSStefano Zampini . 
a - pointer to the device data 40885b7e41feSStefano Zampini 40895b7e41feSStefano Zampini Level: developer 40905b7e41feSStefano Zampini 40915b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead() 40925b7e41feSStefano Zampini @*/ 4093ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4094ed502f03SStefano Zampini { 4095ed502f03SStefano Zampini PetscFunctionBegin; 4096ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4097ed502f03SStefano Zampini PetscValidPointer(a,2); 4098ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4099ed502f03SStefano Zampini *a = NULL; 4100ed502f03SStefano Zampini PetscFunctionReturn(0); 4101ed502f03SStefano Zampini } 4102ed502f03SStefano Zampini 41035b7e41feSStefano Zampini /*@C 41045b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41055b7e41feSStefano Zampini 41065b7e41feSStefano Zampini Not Collective 41075b7e41feSStefano Zampini 41085b7e41feSStefano Zampini Input Parameter: 41095b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41105b7e41feSStefano Zampini 41115b7e41feSStefano Zampini Output Parameter: 41125b7e41feSStefano Zampini . 
a - pointer to the device data 41135b7e41feSStefano Zampini 41145b7e41feSStefano Zampini Level: developer 41155b7e41feSStefano Zampini 41165b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 41175b7e41feSStefano Zampini 41185b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 41195b7e41feSStefano Zampini @*/ 4120039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4121039c6fbaSStefano Zampini { 4122039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4123039c6fbaSStefano Zampini CsrMatrix *csr; 4124039c6fbaSStefano Zampini PetscErrorCode ierr; 4125039c6fbaSStefano Zampini 4126039c6fbaSStefano Zampini PetscFunctionBegin; 4127039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4128039c6fbaSStefano Zampini PetscValidPointer(a,2); 4129039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4130039c6fbaSStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4131039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 413233c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4133039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4134039c6fbaSStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4135039c6fbaSStefano Zampini *a = csr->values->data().get(); 4136039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4137a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4138039c6fbaSStefano Zampini PetscFunctionReturn(0); 4139039c6fbaSStefano Zampini } 41405b7e41feSStefano Zampini /*@C 41415b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access 
array obtained from MatSeqAIJCUSPARSEGetArray() 4142039c6fbaSStefano Zampini 41435b7e41feSStefano Zampini Not Collective 41445b7e41feSStefano Zampini 41455b7e41feSStefano Zampini Input Parameter: 41465b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41475b7e41feSStefano Zampini 41485b7e41feSStefano Zampini Output Parameter: 41495b7e41feSStefano Zampini . a - pointer to the device data 41505b7e41feSStefano Zampini 41515b7e41feSStefano Zampini Level: developer 41525b7e41feSStefano Zampini 41535b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray() 41545b7e41feSStefano Zampini @*/ 4155039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4156039c6fbaSStefano Zampini { 4157039c6fbaSStefano Zampini PetscErrorCode ierr; 4158039c6fbaSStefano Zampini 4159039c6fbaSStefano Zampini PetscFunctionBegin; 4160039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4161039c6fbaSStefano Zampini PetscValidPointer(a,2); 4162039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4163039c6fbaSStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4164039c6fbaSStefano Zampini *a = NULL; 4165039c6fbaSStefano Zampini PetscFunctionReturn(0); 4166039c6fbaSStefano Zampini } 4167039c6fbaSStefano Zampini 41685b7e41feSStefano Zampini /*@C 41695b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41705b7e41feSStefano Zampini 41715b7e41feSStefano Zampini Not Collective 41725b7e41feSStefano Zampini 41735b7e41feSStefano Zampini Input Parameter: 41745b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41755b7e41feSStefano Zampini 41765b7e41feSStefano Zampini Output Parameter: 41775b7e41feSStefano Zampini . 
a - pointer to the device data 41785b7e41feSStefano Zampini 41795b7e41feSStefano Zampini Level: developer 41805b7e41feSStefano Zampini 41815b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 41825b7e41feSStefano Zampini 41835b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 41845b7e41feSStefano Zampini @*/ 4185ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4186ed502f03SStefano Zampini { 4187ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4188ed502f03SStefano Zampini CsrMatrix *csr; 4189a49f1ed0SStefano Zampini PetscErrorCode ierr; 4190ed502f03SStefano Zampini 4191ed502f03SStefano Zampini PetscFunctionBegin; 4192ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4193ed502f03SStefano Zampini PetscValidPointer(a,2); 4194ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4195ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 419633c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4197ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4198ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4199ed502f03SStefano Zampini *a = csr->values->data().get(); 4200039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4201a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4202ed502f03SStefano Zampini PetscFunctionReturn(0); 4203ed502f03SStefano Zampini } 4204ed502f03SStefano Zampini 42055b7e41feSStefano Zampini /*@C 42065b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from 
MatSeqAIJCUSPARSEGetArrayWrite() 42075b7e41feSStefano Zampini 42085b7e41feSStefano Zampini Not Collective 42095b7e41feSStefano Zampini 42105b7e41feSStefano Zampini Input Parameter: 42115b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42125b7e41feSStefano Zampini 42135b7e41feSStefano Zampini Output Parameter: 42145b7e41feSStefano Zampini . a - pointer to the device data 42155b7e41feSStefano Zampini 42165b7e41feSStefano Zampini Level: developer 42175b7e41feSStefano Zampini 42185b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 42195b7e41feSStefano Zampini @*/ 4220ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4221ed502f03SStefano Zampini { 4222ed502f03SStefano Zampini PetscErrorCode ierr; 4223ed502f03SStefano Zampini 4224ed502f03SStefano Zampini PetscFunctionBegin; 4225ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4226ed502f03SStefano Zampini PetscValidPointer(a,2); 4227ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4228ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4229ed502f03SStefano Zampini *a = NULL; 4230ed502f03SStefano Zampini PetscFunctionReturn(0); 4231ed502f03SStefano Zampini } 4232ed502f03SStefano Zampini 4233ed502f03SStefano Zampini struct IJCompare4 4234ed502f03SStefano Zampini { 4235ed502f03SStefano Zampini __host__ __device__ 42362ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4237ed502f03SStefano Zampini { 4238ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4239ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4240ed502f03SStefano Zampini return false; 4241ed502f03SStefano Zampini } 4242ed502f03SStefano Zampini }; 4243ed502f03SStefano Zampini 42448909a122SStefano Zampini struct Shift 42458909a122SStefano Zampini { 
4246ed502f03SStefano Zampini int _shift; 4247ed502f03SStefano Zampini 4248ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4249ed502f03SStefano Zampini __host__ __device__ 4250ed502f03SStefano Zampini inline int operator() (const int &c) 4251ed502f03SStefano Zampini { 4252ed502f03SStefano Zampini return c + _shift; 4253ed502f03SStefano Zampini } 4254ed502f03SStefano Zampini }; 4255ed502f03SStefano Zampini 4256ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4257ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4258ed502f03SStefano Zampini { 4259ed502f03SStefano Zampini PetscErrorCode ierr; 4260ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4261ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4262ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4263ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4264ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4265ed502f03SStefano Zampini cusparseStatus_t stat; 4266ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4267ed502f03SStefano Zampini cudaError_t cerr; 4268ed502f03SStefano Zampini 4269ed502f03SStefano Zampini PetscFunctionBegin; 4270ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4271ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4272ed502f03SStefano Zampini PetscValidPointer(C,4); 4273ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4274ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4275ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4276ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4277ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4278ed502f03SStefano Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4279ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4280ed502f03SStefano Zampini m = A->rmap->n; 4281ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 4282ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4283ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4284ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4285ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4286ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4287ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4288ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4289ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4290ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4291ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4292ed502f03SStefano Zampini c->compressedrow.i = NULL; 4293ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4294ed502f03SStefano Zampini Ccusp->workVector = NULL; 4295ed502f03SStefano Zampini Ccusp->nrows = m; 4296ed502f03SStefano Zampini Ccusp->mat = Cmat; 4297ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4298ed502f03SStefano Zampini Ccsr->num_rows = m; 4299ed502f03SStefano Zampini Ccsr->num_cols = n; 4300ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4301ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4302ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 
4303ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4304ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4305ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4306ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4307ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4308ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4309ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4310ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4311ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4312ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4313ed502f03SStefano Zampini 4314ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4315ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4316ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4317ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4318ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4319ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4320ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4321ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4322ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4323ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4324ed502f03SStefano Zampini if (c->nz) { 43252ed87e7eSStefano Zampini auto Acoo = new 
THRUSTINTARRAY32(Annz); 43262ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 43272ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 43282ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 43292ed87e7eSStefano Zampini 4330ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4331ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4332ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4333ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4334ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4335ed502f03SStefano Zampini } 43362ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 43372ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4338ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4339ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4340ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4341ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4342ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4343ed502f03SStefano Zampini } 43442ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 43452ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 4346ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 43472ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 43482ed87e7eSStefano Zampini Aroff->data().get(), 43492ed87e7eSStefano Zampini Annz, 43502ed87e7eSStefano Zampini m, 43512ed87e7eSStefano Zampini Acoo->data().get(), 43522ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4353ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 43542ed87e7eSStefano Zampini Broff->data().get(), 4355ed502f03SStefano Zampini Bnnz, 4356ed502f03SStefano Zampini m, 
43572ed87e7eSStefano Zampini Bcoo->data().get(), 4358ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 43592ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 43602ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 43612ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 43628909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4363ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4364ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 43658909a122SStefano Zampini #else 43668909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 43678909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 43688909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 43698909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 43708909a122SStefano Zampini #endif 43712ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 43722ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 43732ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 43742ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 43752ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 43762ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4377ed502f03SStefano Zampini auto p1 = 
Ccusp->cooPerm->begin(); 4378ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4379ed502f03SStefano Zampini thrust::advance(p2,Annz); 43802ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 43818909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 43828909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 43838909a122SStefano Zampini #endif 43842ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 43852ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 43862ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 43872ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 43882ed87e7eSStefano Zampini #else 43892ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 43902ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 43912ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 43922ed87e7eSStefano Zampini #endif 4393ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 43942ed87e7eSStefano Zampini Ccoo->data().get(), 4395ed502f03SStefano Zampini c->nz, 4396ed502f03SStefano Zampini m, 4397ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4398ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4399ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 44002ed87e7eSStefano Zampini delete wPerm; 44012ed87e7eSStefano Zampini delete Acoo; 44022ed87e7eSStefano Zampini delete Bcoo; 44032ed87e7eSStefano Zampini delete Ccoo; 4404ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4405ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4406ed502f03SStefano Zampini 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4407ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4408ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4409ed502f03SStefano Zampini #endif 44101a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 44113606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 44123606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4413ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4414ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4415ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4416ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4417ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4418ed502f03SStefano Zampini 44191a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 44201a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4421a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4422ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4423ed502f03SStefano Zampini CmatT->mat = CcsrT; 4424ed502f03SStefano Zampini CcsrT->num_rows = n; 4425ed502f03SStefano Zampini CcsrT->num_cols = m; 4426ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4427ed502f03SStefano Zampini 4428ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4429ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4430ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4431ed502f03SStefano Zampini 4432ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4433ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4434ed502f03SStefano Zampini if (AT) { 4435ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4436ed502f03SStefano Zampini thrust::advance(rT,-1); 4437ed502f03SStefano Zampini } 4438ed502f03SStefano Zampini if (BT) { 4439ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4440ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4441ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4442ed502f03SStefano Zampini } 4443ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4444ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4445ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4446ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4447ed502f03SStefano Zampini if (AT) vT = 
thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4448ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4449ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4450ed502f03SStefano Zampini 4451ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4452ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4453ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4454ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4455ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4456ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4457ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4458ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4459ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4460ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4461ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4462ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4463ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4464ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4465ed502f03SStefano Zampini #endif 4466ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4467ed502f03SStefano Zampini 
} 4468ed502f03SStefano Zampini } 4469ed502f03SStefano Zampini 4470ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4471ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4472ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4473ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4474ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4475ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4476ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4477ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4478ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4479ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4480ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4481ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4482ed502f03SStefano Zampini } else { 4483ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4484ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4485ed502f03SStefano Zampini } 4486ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4487ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4488ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4489ed502f03SStefano Zampini c->maxnz = c->nz; 4490ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4491ed502f03SStefano Zampini c->rmax = 0; 4492ed502f03SStefano Zampini for (i = 0; i < m; i++) { 
4493ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4494ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4495ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4496ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4497ed502f03SStefano Zampini } 4498ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4499ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4500ed502f03SStefano Zampini (*C)->nonzerostate++; 4501ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4502ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4503ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4504ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4505ed502f03SStefano Zampini } else { 4506ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4507ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4508ed502f03SStefano Zampini if (c->nz) { 4509ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4510ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4511ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4512ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4513ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4514ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4515ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4516ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 
4517ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4518ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4519ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4520ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4521ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4522ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4523ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4524ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4525ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4526ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4527ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4528ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4529ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4530ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4531ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4532ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4533ed502f03SStefano Zampini auto zibbit = 
thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4534ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4535ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4536ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4537ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4538a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 45391a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4540ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4541ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4542ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4543ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4544ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4545ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4546ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4547ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 45481a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4549ed502f03SStefano Zampini } 4550ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4551ed502f03SStefano Zampini } 4552ed502f03SStefano Zampini } 4553ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4554ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4555ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4556ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4557ed502f03SStefano Zampini PetscFunctionReturn(0); 4558ed502f03SStefano Zampini } 4559c215019aSStefano Zampini 4560c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4561c215019aSStefano Zampini { 4562c215019aSStefano Zampini PetscErrorCode ierr; 4563c215019aSStefano Zampini bool dmem; 4564c215019aSStefano Zampini const PetscScalar *av; 4565c215019aSStefano Zampini cudaError_t cerr; 4566c215019aSStefano Zampini 4567c215019aSStefano Zampini PetscFunctionBegin; 4568c215019aSStefano Zampini dmem = isCudaMem(v); 4569c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4570c215019aSStefano Zampini if (n && idx) { 4571c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4572c215019aSStefano Zampini widx.assign(idx,idx+n); 4573c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4574c215019aSStefano Zampini 4575c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4576c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 
4577c215019aSStefano Zampini if (dmem) { 4578c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4579c215019aSStefano Zampini } else { 4580c215019aSStefano Zampini w = new THRUSTARRAY(n); 4581c215019aSStefano Zampini dv = w->data(); 4582c215019aSStefano Zampini } 4583c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4584c215019aSStefano Zampini 4585c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4586c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4587c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4588c215019aSStefano Zampini if (w) { 4589c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4590c215019aSStefano Zampini } 4591c215019aSStefano Zampini delete w; 4592c215019aSStefano Zampini } else { 4593c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4594c215019aSStefano Zampini } 4595c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4596c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4597c215019aSStefano Zampini PetscFunctionReturn(0); 4598c215019aSStefano Zampini } 4599