19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17e8d2b73aSMark Adams 18e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 19afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 20afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 21afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
22afb2bd1cSJunchao Zhang 23afb2bd1cSJunchao Zhang typedef enum { 24afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 25afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 27afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 28afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 29afb2bd1cSJunchao Zhang 30afb2bd1cSJunchao Zhang typedef enum { 31afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 34afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 35afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 43afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 44afb2bd1cSJunchao Zhang 45afb2bd1cSJunchao Zhang typedef enum { 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 47afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 48afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 49afb2bd1cSJunchao Zhang */ 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 51afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
52afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 53afb2bd1cSJunchao Zhang #endif 549ae82921SPaul Mullowney 55087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 57087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 58087f3262SPaul Mullowney 596fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 616fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 666fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 674416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 68a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 6933c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 706fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 726fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 736fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 75e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 779ae82921SPaul Mullowney 787f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 82470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 837f756511SDominic Meiser 8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 8557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 86a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 8757181aedSStefano Zampini 887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 897e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 907e8381f9SStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92c215019aSStefano Zampini 93b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 94b06137fdSPaul Mullowney { 95b06137fdSPaul Mullowney cusparseStatus_t stat; 96b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 97b06137fdSPaul Mullowney 98b06137fdSPaul Mullowney PetscFunctionBegin; 99d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 100b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10157d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 102b06137fdSPaul Mullowney PetscFunctionReturn(0); 103b06137fdSPaul Mullowney } 104b06137fdSPaul Mullowney 105b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 106b06137fdSPaul Mullowney { 107b06137fdSPaul Mullowney cusparseStatus_t stat; 108b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 109b06137fdSPaul Mullowney 110b06137fdSPaul Mullowney PetscFunctionBegin; 111d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1126b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11316a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11457d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11516a2e217SAlejandro Lamas Daviña } 116b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1176b1cf21dSAlejandro Lamas Daviña } 11857d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 119b06137fdSPaul Mullowney PetscFunctionReturn(0); 120b06137fdSPaul Mullowney } 121b06137fdSPaul Mullowney 122b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 123b06137fdSPaul Mullowney { 124b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1257e8381f9SStefano Zampini PetscBool flg; 1267e8381f9SStefano Zampini PetscErrorCode ierr; 127ccdfe979SStefano Zampini 128b06137fdSPaul Mullowney PetscFunctionBegin; 1297e8381f9SStefano Zampini ierr = 
PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1307e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 131ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 132b06137fdSPaul Mullowney PetscFunctionReturn(0); 133b06137fdSPaul Mullowney } 134b06137fdSPaul Mullowney 135ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1369ae82921SPaul Mullowney { 1379ae82921SPaul Mullowney PetscFunctionBegin; 1389ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1399ae82921SPaul Mullowney PetscFunctionReturn(0); 1409ae82921SPaul Mullowney } 1419ae82921SPaul Mullowney 142c708e6cdSJed Brown /*MC 143087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 144087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 145087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 146087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 147087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 148087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
149c708e6cdSJed Brown 1509ae82921SPaul Mullowney Level: beginner 151c708e6cdSJed Brown 1523ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 153c708e6cdSJed Brown M*/ 1549ae82921SPaul Mullowney 15542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1569ae82921SPaul Mullowney { 1579ae82921SPaul Mullowney PetscErrorCode ierr; 158bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1599ae82921SPaul Mullowney 1609ae82921SPaul Mullowney PetscFunctionBegin; 161bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 162bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1632c7c0729SBarry Smith (*B)->factortype = ftype; 1649ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1652205254eSKarl Rupp 166087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16733d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1689ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1699ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1704ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1714ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 1724ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 173087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 174087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 175087f3262SPaul Mullowney 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1764ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 1774ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 1789ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 179bc3f50f2SPaul Mullowney 180fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1814ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1823ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1839ae82921SPaul Mullowney PetscFunctionReturn(0); 1849ae82921SPaul Mullowney } 1859ae82921SPaul Mullowney 186bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 187ca45077fSPaul Mullowney { 188aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1896e111a19SKarl Rupp 190ca45077fSPaul Mullowney PetscFunctionBegin; 191ca45077fSPaul Mullowney switch (op) { 192e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 193aa372e3fSPaul Mullowney cusparsestruct->format = format; 194ca45077fSPaul Mullowney break; 195e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 196aa372e3fSPaul Mullowney cusparsestruct->format = format; 197ca45077fSPaul Mullowney break; 198ca45077fSPaul Mullowney default: 19936d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 200ca45077fSPaul Mullowney } 201ca45077fSPaul Mullowney PetscFunctionReturn(0); 202ca45077fSPaul Mullowney } 2039ae82921SPaul Mullowney 204e057df02SPaul Mullowney /*@ 205e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 206e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 207aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 208e057df02SPaul Mullowney Not Collective 209e057df02SPaul Mullowney 210e057df02SPaul Mullowney Input Parameters: 2118468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 21236d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2132692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 214e057df02SPaul Mullowney 215e057df02SPaul Mullowney Output Parameter: 216e057df02SPaul Mullowney 217e057df02SPaul Mullowney Level: intermediate 218e057df02SPaul Mullowney 2198468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 220e057df02SPaul Mullowney @*/ 221e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 222e057df02SPaul Mullowney { 223e057df02SPaul Mullowney PetscErrorCode ierr; 2246e111a19SKarl Rupp 225e057df02SPaul Mullowney PetscFunctionBegin; 226e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 227e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 228e057df02SPaul Mullowney PetscFunctionReturn(0); 229e057df02SPaul Mullowney } 230e057df02SPaul Mullowney 2311a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 232e6e9a74fSStefano Zampini { 233e6e9a74fSStefano Zampini PetscErrorCode ierr; 234e6e9a74fSStefano Zampini 235e6e9a74fSStefano Zampini PetscFunctionBegin; 2361a2c6b5cSJunchao Zhang switch (op) { 2371a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2381a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2391a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2401a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2411a2c6b5cSJunchao Zhang break; 2421a2c6b5cSJunchao Zhang default: 2431a2c6b5cSJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2441a2c6b5cSJunchao Zhang break; 245e6e9a74fSStefano Zampini } 246e6e9a74fSStefano Zampini PetscFunctionReturn(0); 247e6e9a74fSStefano Zampini } 248e6e9a74fSStefano Zampini 249bddcd29dSMark Adams static 
PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 250bddcd29dSMark Adams 251bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 252bddcd29dSMark Adams { 253bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 254bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 255bddcd29dSMark Adams PetscBool row_identity,col_identity; 256bddcd29dSMark Adams PetscErrorCode ierr; 257bddcd29dSMark Adams 258bddcd29dSMark Adams PetscFunctionBegin; 259bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 260bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 261bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 262bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 263bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 264bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 265bddcd29dSMark Adams if (row_identity && col_identity) { 266bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 267bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 268bddcd29dSMark Adams B->ops->matsolve = NULL; 269bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 270bddcd29dSMark Adams } else { 271bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 272bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 273bddcd29dSMark Adams B->ops->matsolve = NULL; 274bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 275bddcd29dSMark Adams } 276bddcd29dSMark Adams 277bddcd29dSMark Adams /* get the triangular factors */ 278bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 279bddcd29dSMark Adams PetscFunctionReturn(0); 280bddcd29dSMark Adams } 281bddcd29dSMark Adams 2824416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems 
*PetscOptionsObject,Mat A) 2839ae82921SPaul Mullowney { 2849ae82921SPaul Mullowney PetscErrorCode ierr; 285e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2869ae82921SPaul Mullowney PetscBool flg; 287a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2886e111a19SKarl Rupp 2899ae82921SPaul Mullowney PetscFunctionBegin; 290e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2919ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 292e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 293a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 294afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 295afb2bd1cSJunchao Zhang 2964c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 297a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 298afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 299afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 300afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 301afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 302afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 303afb2bd1cSJunchao 
Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 304afb2bd1cSJunchao Zhang 305afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 306afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 307afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 308afb2bd1cSJunchao Zhang 309afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 310afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 311afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 312afb2bd1cSJunchao Zhang #endif 3134c87dfd4SPaul Mullowney } 3140af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3159ae82921SPaul Mullowney PetscFunctionReturn(0); 3169ae82921SPaul Mullowney } 3179ae82921SPaul Mullowney 3186fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3199ae82921SPaul Mullowney { 320da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3219ae82921SPaul Mullowney PetscErrorCode ierr; 3229ae82921SPaul Mullowney 3239ae82921SPaul Mullowney PetscFunctionBegin; 324da79fbbcSStefano Zampini ierr = 
MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3259ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3269ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3279ae82921SPaul Mullowney PetscFunctionReturn(0); 3289ae82921SPaul Mullowney } 3299ae82921SPaul Mullowney 3306fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3319ae82921SPaul Mullowney { 332da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3339ae82921SPaul Mullowney PetscErrorCode ierr; 3349ae82921SPaul Mullowney 3359ae82921SPaul Mullowney PetscFunctionBegin; 336da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3379ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3389ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3399ae82921SPaul Mullowney PetscFunctionReturn(0); 3409ae82921SPaul Mullowney } 3419ae82921SPaul Mullowney 342087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 343087f3262SPaul Mullowney { 344da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 345087f3262SPaul Mullowney PetscErrorCode ierr; 346087f3262SPaul Mullowney 347087f3262SPaul Mullowney PetscFunctionBegin; 348da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 349087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 350087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 351087f3262SPaul Mullowney PetscFunctionReturn(0); 352087f3262SPaul Mullowney } 353087f3262SPaul Mullowney 354087f3262SPaul 
Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 355087f3262SPaul Mullowney { 356da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 357087f3262SPaul Mullowney PetscErrorCode ierr; 358087f3262SPaul Mullowney 359087f3262SPaul Mullowney PetscFunctionBegin; 360da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 361087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 362087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 363087f3262SPaul Mullowney PetscFunctionReturn(0); 364087f3262SPaul Mullowney } 365087f3262SPaul Mullowney 366087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3679ae82921SPaul Mullowney { 3689ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3699ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3709ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 371aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3729ae82921SPaul Mullowney cusparseStatus_t stat; 3739ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3749ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3759ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3769ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 377b175d8bbSPaul Mullowney PetscErrorCode ierr; 37857d48284SJunchao Zhang cudaError_t cerr; 3799ae82921SPaul Mullowney 3809ae82921SPaul Mullowney PetscFunctionBegin; 381cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 382c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3839ae82921SPaul Mullowney try { 3849ae82921SPaul 
Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3859ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 386da79fbbcSStefano Zampini if (!loTriFactor) { 3872cbc15d9SMark PetscScalar *AALo; 3882cbc15d9SMark 3892cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3909ae82921SPaul Mullowney 3919ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 39257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 39357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 3949ae82921SPaul Mullowney 3959ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3969ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3979ae82921SPaul Mullowney AiLo[n] = nzLower; 3989ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3999ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4009ae82921SPaul Mullowney v = aa; 4019ae82921SPaul Mullowney vi = aj; 4029ae82921SPaul Mullowney offset = 1; 4039ae82921SPaul Mullowney rowOffset= 1; 4049ae82921SPaul Mullowney for (i=1; i<n; i++) { 4059ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 406e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4079ae82921SPaul Mullowney AiLo[i] = rowOffset; 4089ae82921SPaul Mullowney rowOffset += nz+1; 4099ae82921SPaul Mullowney 410580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 411580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 4129ae82921SPaul Mullowney 4139ae82921SPaul Mullowney offset += nz; 4149ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4159ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4169ae82921SPaul Mullowney offset += 1; 4179ae82921SPaul Mullowney 4189ae82921SPaul Mullowney v += nz; 4199ae82921SPaul Mullowney vi += nz; 4209ae82921SPaul Mullowney } 4212205254eSKarl Rupp 422aa372e3fSPaul 
Mullowney /* allocate space for the triangular factor information */ 423da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 424da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 425aa372e3fSPaul Mullowney /* Create the matrix description */ 42657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 42757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 429afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 430afb2bd1cSJunchao Zhang #else 43157d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 432afb2bd1cSJunchao Zhang #endif 43357d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 43457d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 435aa372e3fSPaul Mullowney 436aa372e3fSPaul Mullowney /* set the operation */ 437aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 438aa372e3fSPaul Mullowney 439aa372e3fSPaul Mullowney /* set the matrix */ 440aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 441aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 442aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 443aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 444aa372e3fSPaul Mullowney 445aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 447aa372e3fSPaul Mullowney 448aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 449aa372e3fSPaul Mullowney 
loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 450aa372e3fSPaul Mullowney 451aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 452aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 453aa372e3fSPaul Mullowney 454afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 455da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 456afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4571b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 458afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 459afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 460afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 461afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 462afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 463afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 464afb2bd1cSJunchao Zhang #endif 465afb2bd1cSJunchao Zhang 466aa372e3fSPaul Mullowney /* perform the solve analysis */ 467aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 468aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 469aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 470afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 4711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 472afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 473afb2bd1cSJunchao Zhang #endif 
474afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 475da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 476da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 477aa372e3fSPaul Mullowney 478da79fbbcSStefano Zampini /* assign the pointer */ 479aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4802cbc15d9SMark loTriFactor->AA_h = AALo; 48157d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 48257d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4834863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484da79fbbcSStefano Zampini } else { /* update values only */ 4852cbc15d9SMark if (!loTriFactor->AA_h) { 4862cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4872cbc15d9SMark } 488da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4892cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 490da79fbbcSStefano Zampini v = aa; 491da79fbbcSStefano Zampini vi = aj; 492da79fbbcSStefano Zampini offset = 1; 493da79fbbcSStefano Zampini for (i=1; i<n; i++) { 494da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 4952cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 496da79fbbcSStefano Zampini offset += nz; 4972cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 498da79fbbcSStefano Zampini offset += 1; 499da79fbbcSStefano Zampini v += nz; 500da79fbbcSStefano Zampini } 5012cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 502da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 503da79fbbcSStefano Zampini } 5049ae82921SPaul Mullowney } catch(char *ex) { 5059ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5069ae82921SPaul Mullowney } 5079ae82921SPaul Mullowney } 5089ae82921SPaul 
Mullowney PetscFunctionReturn(0); 5099ae82921SPaul Mullowney } 5109ae82921SPaul Mullowney 511087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5129ae82921SPaul Mullowney { 5139ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5149ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5159ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 516aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5179ae82921SPaul Mullowney cusparseStatus_t stat; 5189ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5199ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5209ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5219ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5229ae82921SPaul Mullowney PetscErrorCode ierr; 52357d48284SJunchao Zhang cudaError_t cerr; 5249ae82921SPaul Mullowney 5259ae82921SPaul Mullowney PetscFunctionBegin; 526cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 527c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5289ae82921SPaul Mullowney try { 5299ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5309ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 531da79fbbcSStefano Zampini if (!upTriFactor) { 5322cbc15d9SMark PetscScalar *AAUp; 5332cbc15d9SMark 5342cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5352cbc15d9SMark 5369ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 53757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 53857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5399ae82921SPaul Mullowney 5409ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5419ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5429ae82921SPaul Mullowney AiUp[n]=nzUpper; 5439ae82921SPaul Mullowney offset = nzUpper; 5449ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5459ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5469ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5479ae82921SPaul Mullowney 548e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5499ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5509ae82921SPaul Mullowney 551e057df02SPaul Mullowney /* decrement the offset */ 5529ae82921SPaul Mullowney offset -= (nz+1); 5539ae82921SPaul Mullowney 554e057df02SPaul Mullowney /* first, set the diagonal elements */ 5559ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 55609f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5579ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5589ae82921SPaul Mullowney 559580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 560580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5619ae82921SPaul Mullowney } 5622205254eSKarl Rupp 563aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 564da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 565da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5662205254eSKarl Rupp 567aa372e3fSPaul Mullowney /* Create the matrix description */ 56857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 56957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 571afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 572afb2bd1cSJunchao Zhang #else 57357d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 574afb2bd1cSJunchao Zhang #endif 57557d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 57657d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 577aa372e3fSPaul Mullowney 578aa372e3fSPaul Mullowney /* set the operation */ 579aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 580aa372e3fSPaul Mullowney 581aa372e3fSPaul Mullowney /* set the matrix */ 582aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 583aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 584aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 585aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 586aa372e3fSPaul Mullowney 587aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 588aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 589aa372e3fSPaul Mullowney 590aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 591aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 592aa372e3fSPaul Mullowney 593aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
594aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 595aa372e3fSPaul Mullowney 596afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 597da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 598afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 600afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 601afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 604afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 605afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 606afb2bd1cSJunchao Zhang #endif 607afb2bd1cSJunchao Zhang 608aa372e3fSPaul Mullowney /* perform the solve analysis */ 609aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 6131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 614afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 615afb2bd1cSJunchao Zhang #endif 616afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 617da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 618da79fbbcSStefano Zampini ierr = 
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 619aa372e3fSPaul Mullowney 620da79fbbcSStefano Zampini /* assign the pointer */ 621aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6222cbc15d9SMark upTriFactor->AA_h = AAUp; 62357d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 62457d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6254863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 626da79fbbcSStefano Zampini } else { 6272cbc15d9SMark if (!upTriFactor->AA_h) { 6282cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6292cbc15d9SMark } 630da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 631da79fbbcSStefano Zampini offset = nzUpper; 632da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 633da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 634da79fbbcSStefano Zampini 635da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 636da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 637da79fbbcSStefano Zampini 638da79fbbcSStefano Zampini /* decrement the offset */ 639da79fbbcSStefano Zampini offset -= (nz+1); 640da79fbbcSStefano Zampini 641da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6422cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6432cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 644da79fbbcSStefano Zampini } 6452cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 646da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 647da79fbbcSStefano Zampini } 6489ae82921SPaul Mullowney } catch(char *ex) { 6499ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6509ae82921SPaul Mullowney } 6519ae82921SPaul Mullowney } 6529ae82921SPaul Mullowney 
PetscFunctionReturn(0); 6539ae82921SPaul Mullowney } 6549ae82921SPaul Mullowney 655087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6569ae82921SPaul Mullowney { 6579ae82921SPaul Mullowney PetscErrorCode ierr; 6589ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6599ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6609ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6619ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6629ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6639ae82921SPaul Mullowney 6649ae82921SPaul Mullowney PetscFunctionBegin; 665da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 666087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 667087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6682205254eSKarl Rupp 669da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 670aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6719ae82921SPaul Mullowney 672c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 673e057df02SPaul Mullowney /* lower triangular indices */ 6749ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 675da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 676da79fbbcSStefano Zampini const PetscInt *r; 677da79fbbcSStefano Zampini 678da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 679aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 680aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6819ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 682da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 
683da79fbbcSStefano Zampini } 6849ae82921SPaul Mullowney 685e057df02SPaul Mullowney /* upper triangular indices */ 6869ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 687da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 688da79fbbcSStefano Zampini const PetscInt *c; 689da79fbbcSStefano Zampini 690da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 691aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 692aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6939ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 694da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 695da79fbbcSStefano Zampini } 6969ae82921SPaul Mullowney PetscFunctionReturn(0); 6979ae82921SPaul Mullowney } 6989ae82921SPaul Mullowney 699087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 700087f3262SPaul Mullowney { 701087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 702087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 703aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 704aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 705087f3262SPaul Mullowney cusparseStatus_t stat; 706087f3262SPaul Mullowney PetscErrorCode ierr; 70757d48284SJunchao Zhang cudaError_t cerr; 708087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 709087f3262SPaul Mullowney PetscScalar *AAUp; 710087f3262SPaul Mullowney PetscScalar *AALo; 711087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 712087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 713087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 
714087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 715087f3262SPaul Mullowney 716087f3262SPaul Mullowney PetscFunctionBegin; 717cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 718c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 719087f3262SPaul Mullowney try { 720da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 721da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 722da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 723087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 72457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 72557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 726087f3262SPaul Mullowney 727087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 728087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 729087f3262SPaul Mullowney AiUp[n]=nzUpper; 730087f3262SPaul Mullowney offset = 0; 731087f3262SPaul Mullowney for (i=0; i<n; i++) { 732087f3262SPaul Mullowney /* set the pointers */ 733087f3262SPaul Mullowney v = aa + ai[i]; 734087f3262SPaul Mullowney vj = aj + ai[i]; 735087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 736087f3262SPaul Mullowney 737087f3262SPaul Mullowney /* first, set the diagonal elements */ 738087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 73909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 740087f3262SPaul Mullowney AiUp[i] = offset; 74109f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 742087f3262SPaul Mullowney 743087f3262SPaul Mullowney offset+=1; 744087f3262SPaul Mullowney if (nz>0) { 745f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 746580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, 
nz);CHKERRQ(ierr); 747087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 748087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 749087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 750087f3262SPaul Mullowney } 751087f3262SPaul Mullowney offset+=nz; 752087f3262SPaul Mullowney } 753087f3262SPaul Mullowney } 754087f3262SPaul Mullowney 755aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 756da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 757da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 758087f3262SPaul Mullowney 759aa372e3fSPaul Mullowney /* Create the matrix description */ 76057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 76157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 763afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 764afb2bd1cSJunchao Zhang #else 76557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 766afb2bd1cSJunchao Zhang #endif 76757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 76857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 769087f3262SPaul Mullowney 770aa372e3fSPaul Mullowney /* set the matrix */ 771aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 772aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 773aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 774aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 775aa372e3fSPaul Mullowney 776aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
777aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 778aa372e3fSPaul Mullowney 779aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 780aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 781aa372e3fSPaul Mullowney 782aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 783aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 784aa372e3fSPaul Mullowney 785afb2bd1cSJunchao Zhang /* set the operation */ 786afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 787afb2bd1cSJunchao Zhang 788afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 789da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 790afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7911b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 792afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 793afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 794afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 795afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 796afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 797afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 798afb2bd1cSJunchao Zhang #endif 799afb2bd1cSJunchao Zhang 800aa372e3fSPaul Mullowney /* perform the solve analysis */ 801aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 802aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 
803aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 804afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 8051b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 806afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 807afb2bd1cSJunchao Zhang #endif 808afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 809da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 810da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 811aa372e3fSPaul Mullowney 812da79fbbcSStefano Zampini /* assign the pointer */ 813aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 814aa372e3fSPaul Mullowney 815aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 816da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 817da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 818aa372e3fSPaul Mullowney 819aa372e3fSPaul Mullowney /* Create the matrix description */ 82057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 82157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8221b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 824afb2bd1cSJunchao Zhang #else 82557d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 826afb2bd1cSJunchao Zhang #endif 82757d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 82857d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, 
CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 829aa372e3fSPaul Mullowney 830aa372e3fSPaul Mullowney /* set the operation */ 831aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 832aa372e3fSPaul Mullowney 833aa372e3fSPaul Mullowney /* set the matrix */ 834aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 835aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 836aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 837aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 838aa372e3fSPaul Mullowney 839aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 840aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 841aa372e3fSPaul Mullowney 842aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 843aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 846aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 847aa372e3fSPaul Mullowney 848afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 849da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 850afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 852afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 853afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 854afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 855afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 
856afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 857afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 858afb2bd1cSJunchao Zhang #endif 859afb2bd1cSJunchao Zhang 860aa372e3fSPaul Mullowney /* perform the solve analysis */ 861aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 862aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 863aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 864afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 8651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 866afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 867afb2bd1cSJunchao Zhang #endif 868afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 869da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 870da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 871aa372e3fSPaul Mullowney 872da79fbbcSStefano Zampini /* assign the pointer */ 873aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 874087f3262SPaul Mullowney 875da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 87657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 87757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 878da79fbbcSStefano Zampini } else { 879da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 880da79fbbcSStefano Zampini offset = 0; 881da79fbbcSStefano Zampini for (i=0; i<n; i++) { 882da79fbbcSStefano Zampini /* set the pointers */ 883da79fbbcSStefano Zampini v = aa + ai[i]; 884da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 
885da79fbbcSStefano Zampini 886da79fbbcSStefano Zampini /* first, set the diagonal elements */ 887da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 888da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 889da79fbbcSStefano Zampini 890da79fbbcSStefano Zampini offset+=1; 891da79fbbcSStefano Zampini if (nz>0) { 892da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 893da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 894da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 895da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 896da79fbbcSStefano Zampini } 897da79fbbcSStefano Zampini offset+=nz; 898da79fbbcSStefano Zampini } 899da79fbbcSStefano Zampini } 900da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 901da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 902da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 903da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 904da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 905da79fbbcSStefano Zampini } 90657d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 90757d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 908087f3262SPaul Mullowney } catch(char *ex) { 909087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 910087f3262SPaul Mullowney } 911087f3262SPaul Mullowney } 912087f3262SPaul Mullowney PetscFunctionReturn(0); 913087f3262SPaul Mullowney } 914087f3262SPaul Mullowney 915087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9169ae82921SPaul Mullowney { 9179ae82921SPaul Mullowney PetscErrorCode ierr; 918087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 919087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 920087f3262SPaul Mullowney IS ip = a->row; 921087f3262SPaul Mullowney PetscBool perm_identity; 922087f3262SPaul Mullowney PetscInt n = A->rmap->n; 923087f3262SPaul Mullowney 924087f3262SPaul Mullowney PetscFunctionBegin; 925da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 926087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 927da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 928aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 929aa372e3fSPaul Mullowney 930da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 931da79fbbcSStefano Zampini 932087f3262SPaul Mullowney /* lower triangular indices */ 933087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 934087f3262SPaul Mullowney if (!perm_identity) { 9354e4bbfaaSStefano Zampini IS iip; 936da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9374e4bbfaaSStefano Zampini 9384e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9394e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 940da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 941aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 942aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 943aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9444e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9454e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9464e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 947087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 948da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 949da79fbbcSStefano Zampini } 
950087f3262SPaul Mullowney PetscFunctionReturn(0); 951087f3262SPaul Mullowney } 952087f3262SPaul Mullowney 953087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 954087f3262SPaul Mullowney { 955087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 956087f3262SPaul Mullowney IS ip = b->row; 957087f3262SPaul Mullowney PetscBool perm_identity; 958b175d8bbSPaul Mullowney PetscErrorCode ierr; 959087f3262SPaul Mullowney 960087f3262SPaul Mullowney PetscFunctionBegin; 96157181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 962087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 963ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 964087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 965087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 966087f3262SPaul Mullowney if (perm_identity) { 967087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 968087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9694e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9704e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 971087f3262SPaul Mullowney } else { 972087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 973087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9744e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9754e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 976087f3262SPaul Mullowney } 977087f3262SPaul Mullowney 978087f3262SPaul Mullowney /* get the triangular factors */ 979087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 980087f3262SPaul Mullowney PetscFunctionReturn(0); 981087f3262SPaul Mullowney } 9829ae82921SPaul Mullowney 983b175d8bbSPaul Mullowney static PetscErrorCode 
MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 984bda325fcSPaul Mullowney { 985bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 986aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 987aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 988da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 989da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 990bda325fcSPaul Mullowney cusparseStatus_t stat; 991aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 992aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 993aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 994aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 9951b0a6780SStefano Zampini cudaError_t cerr; 996da79fbbcSStefano Zampini PetscErrorCode ierr; 997b175d8bbSPaul Mullowney 998bda325fcSPaul Mullowney PetscFunctionBegin; 999aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1000da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1001da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1002aa372e3fSPaul Mullowney 1003aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1004aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1005aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1006aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1007aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1008aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1009aa372e3fSPaul Mullowney 1010aa372e3fSPaul Mullowney /* Create the matrix description */ 101157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 101257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 101357d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 101457d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 101557d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1016aa372e3fSPaul Mullowney 1017aa372e3fSPaul Mullowney /* set the operation */ 1018aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1019aa372e3fSPaul Mullowney 1020aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1021aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1022afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1023afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1024aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1025afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1026afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1027afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1028aa372e3fSPaul Mullowney 1029aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1030afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1031afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1032afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1033afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1034afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1035afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1036afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1037afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1038afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1039afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10401b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1041afb2bd1cSJunchao Zhang #endif 1042afb2bd1cSJunchao Zhang 1043da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1044aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1045aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1046aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1047aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1048aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1049aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1050afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1051afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1053afb2bd1cSJunchao 
Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1054afb2bd1cSJunchao Zhang #else 1055afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1056afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1057afb2bd1cSJunchao Zhang #endif 1058afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1059da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1060da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1061aa372e3fSPaul Mullowney 1062afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1063da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1064afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1066afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1067afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1068afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1069afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1070afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1071afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1072afb2bd1cSJunchao Zhang #endif 1073afb2bd1cSJunchao Zhang 1074afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1075aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 
loTriFactorT->csrMat->row_offsets->data().get(), 1078afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 10791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1081afb2bd1cSJunchao Zhang #endif 1082afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1083da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1084da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1085aa372e3fSPaul Mullowney 1086da79fbbcSStefano Zampini /* assign the pointer */ 1087aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1088aa372e3fSPaul Mullowney 1089aa372e3fSPaul Mullowney /*********************************************/ 1090aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1091aa372e3fSPaul Mullowney /*********************************************/ 1092aa372e3fSPaul Mullowney 1093aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1094da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1095da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1096aa372e3fSPaul Mullowney 1097aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1098aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1099aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1100aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1101aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1102aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1103aa372e3fSPaul Mullowney 1104aa372e3fSPaul Mullowney /* Create the matrix description */ 110557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 110657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 110757d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 110857d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 110957d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1110aa372e3fSPaul Mullowney 1111aa372e3fSPaul Mullowney /* set the operation */ 1112aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1113aa372e3fSPaul Mullowney 1114aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1115aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1116afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1117afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1118aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1119afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1120afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1121afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1122aa372e3fSPaul Mullowney 1123aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1124afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1125afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1126afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1127afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1128afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1129afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1130afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1131afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1132afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1133afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1134afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1135afb2bd1cSJunchao Zhang #endif 1136afb2bd1cSJunchao Zhang 1137da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1138aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1139aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1140aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1141aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1142aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1143aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1144afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1145afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1146afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1147afb2bd1cSJunchao Zhang 
CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1148afb2bd1cSJunchao Zhang #else 1149afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1150afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1151afb2bd1cSJunchao Zhang #endif 1152afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1153da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1154da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1155aa372e3fSPaul Mullowney 1156afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1157da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1158afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11591b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1160afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1161afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1162afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1163afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1164afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1165afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1166afb2bd1cSJunchao Zhang #endif 1167afb2bd1cSJunchao Zhang 1168afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1169aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1170afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1171afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 
upTriFactorT->csrMat->row_offsets->data().get(), 1172afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 11731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1174afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1175afb2bd1cSJunchao Zhang #endif 1176afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1177da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1178da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1179aa372e3fSPaul Mullowney 1180da79fbbcSStefano Zampini /* assign the pointer */ 1181aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1182bda325fcSPaul Mullowney PetscFunctionReturn(0); 1183bda325fcSPaul Mullowney } 1184bda325fcSPaul Mullowney 1185a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1186a49f1ed0SStefano Zampini { 1187a49f1ed0SStefano Zampini __host__ __device__ 1188a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1189a49f1ed0SStefano Zampini { 1190a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1191a49f1ed0SStefano Zampini } 1192a49f1ed0SStefano Zampini }; 1193a49f1ed0SStefano Zampini 11941a2c6b5cSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1195bda325fcSPaul Mullowney { 1196aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1197a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1198bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1199bda325fcSPaul Mullowney cusparseStatus_t stat; 1200aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1201b06137fdSPaul Mullowney cudaError_t err; 120285ba7357SStefano Zampini PetscErrorCode ierr; 1203b175d8bbSPaul Mullowney 1204bda325fcSPaul Mullowney PetscFunctionBegin; 12051a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) 
PetscFunctionReturn(0); 1206a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1207a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1208e8d2b73aSMark Adams if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1209a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1210e8d2b73aSMark Adams if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12111a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 121285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1213*ee7b52eaSHong Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1214a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1215a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1216a49f1ed0SStefano Zampini } 1217a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1218aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 121957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1220aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 122157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 122257d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1223aa372e3fSPaul Mullowney 1224b06137fdSPaul Mullowney /* set alpha and beta */ 1225afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12267656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12277656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), 
sizeof(PetscScalar));CHKERRCUDA(err); 1228afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12297656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12307656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1231b06137fdSPaul Mullowney 1232aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1233aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1234a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1235554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1236554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1237aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1238a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1239aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1240aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1241a3fdcf43SKarl Rupp 1242039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 124381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1244afb2bd1cSJunchao Zhang 1245afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1246afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1247afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1248afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1249afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1250afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1251afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 
1252afb2bd1cSJunchao Zhang #endif 1253aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1254afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1255afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1256afb2bd1cSJunchao Zhang #else 1257aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 125851c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 125951c6d536SStefano Zampini /* First convert HYB to CSR */ 1260aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1261aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1262aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1263aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1264aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1265aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1266aa372e3fSPaul Mullowney 1267aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1268aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1269aa372e3fSPaul Mullowney temp->values->data().get(), 1270aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 127157d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1272aa372e3fSPaul Mullowney 1273aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1274aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1275aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1276aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1277aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1278aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1279aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1280aa372e3fSPaul Mullowney 1281aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1282aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1283aa372e3fSPaul Mullowney temp->values->data().get(), 1284aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1285aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1286aa372e3fSPaul Mullowney tempT->values->data().get(), 1287aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1288aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 128957d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1290aa372e3fSPaul Mullowney 1291aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1292aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 129357d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1294aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1295aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1296aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1297aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1298aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1299aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 130057d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1301aa372e3fSPaul Mullowney 1302aa372e3fSPaul Mullowney /* assign the pointer */ 1303aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13041a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1305aa372e3fSPaul Mullowney /* delete temporaries */ 1306aa372e3fSPaul Mullowney if (tempT) { 1307aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1308aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1309aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1310aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1311087f3262SPaul Mullowney } 1312aa372e3fSPaul Mullowney if (temp) { 1313aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1314aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1315aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1316aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1317aa372e3fSPaul Mullowney } 1318afb2bd1cSJunchao Zhang #endif 1319aa372e3fSPaul Mullowney } 1320a49f1ed0SStefano Zampini } 1321a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1322a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1323a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1324e8d2b73aSMark Adams if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1325e8d2b73aSMark Adams if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1326e8d2b73aSMark Adams if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1327e8d2b73aSMark Adams if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1328e8d2b73aSMark Adams if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1329e8d2b73aSMark Adams if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1330e8d2b73aSMark Adams if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1331e8d2b73aSMark Adams if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1332a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1333a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1334a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1335a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1336a49f1ed0SStefano Zampini } 1337a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1338a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1339a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1340a49f1ed0SStefano Zampini 1341a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1342a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1343a49f1ed0SStefano Zampini void *csr2cscBuffer; 1344a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1345a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 
1346a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1347a49f1ed0SStefano Zampini matrix->values->data().get(), 1348a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1349a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1350a49f1ed0SStefano Zampini matrixT->values->data().get(), 1351a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1352a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1353a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1354a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1355a49f1ed0SStefano Zampini #endif 1356a49f1ed0SStefano Zampini 13571a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13581a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13591a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13601a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13611a2c6b5cSJunchao Zhang 13621a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13631a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13641a2c6b5cSJunchao Zhang */ 13651a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 13661a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 13671a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 13681a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 13691a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1370a49f1ed0SStefano Zampini matrixT->values->data().get(), 1371a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1372a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1373a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13741a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1375a49f1ed0SStefano Zampini #else 1376a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 13771a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1378a49f1ed0SStefano Zampini #endif 13791a2c6b5cSJunchao Zhang } else { 13801a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13811a2c6b5cSJunchao Zhang } 13821a2c6b5cSJunchao Zhang 1383a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1384a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1385a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1386a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1387a49f1ed0SStefano Zampini #endif 1388a49f1ed0SStefano Zampini } 1389a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1390a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1391a49f1ed0SStefano Zampini matrixT->values->begin())); 1392a49f1ed0SStefano Zampini } 1393*ee7b52eaSHong Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 139485ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1395213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1396213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1397aa372e3fSPaul Mullowney /* assign the pointer */ 1398aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 13991a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1400bda325fcSPaul Mullowney PetscFunctionReturn(0); 1401bda325fcSPaul Mullowney } 1402bda325fcSPaul Mullowney 1403a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14046fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1405bda325fcSPaul Mullowney { 1406c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1407465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1408465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1409465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1410465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1411bda325fcSPaul Mullowney cusparseStatus_t stat; 1412bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416b175d8bbSPaul 
Mullowney PetscErrorCode ierr; 1417bda325fcSPaul Mullowney 1418bda325fcSPaul Mullowney PetscFunctionBegin; 1419aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1420aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1421bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1422aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1423aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1424bda325fcSPaul Mullowney } 1425bda325fcSPaul Mullowney 1426bda325fcSPaul Mullowney /* Get the GPU pointers */ 1427c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1428c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1429c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1430c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1431bda325fcSPaul Mullowney 14327a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1433aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1434a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1435c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1436c41cb2e2SAlejandro Lamas Daviña xGPU); 1437aa372e3fSPaul Mullowney 1438aa372e3fSPaul Mullowney /* First, solve U */ 1439aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1440afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1442afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1443afb2bd1cSJunchao Zhang #endif 1444afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1445aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1446aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1447aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1448aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1449afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 14501b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1451afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1452afb2bd1cSJunchao Zhang #endif 1453afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1454aa372e3fSPaul Mullowney 1455aa372e3fSPaul Mullowney /* Then, solve L */ 1456aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1457afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14581b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1459afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1460afb2bd1cSJunchao Zhang #endif 1461afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1462aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1463aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1464aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1465aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1466afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1468afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1469afb2bd1cSJunchao Zhang #endif 1470afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1471aa372e3fSPaul Mullowney 1472aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1473a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1474c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1475aa372e3fSPaul Mullowney tempGPU->begin()); 1476aa372e3fSPaul Mullowney 1477aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1478a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1479bda325fcSPaul Mullowney 1480bda325fcSPaul Mullowney /* restore */ 1481c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1482c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1483661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1484958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1485bda325fcSPaul Mullowney PetscFunctionReturn(0); 1486bda325fcSPaul Mullowney } 1487bda325fcSPaul Mullowney 14886fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1489bda325fcSPaul Mullowney { 1490465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1491465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1492bda325fcSPaul Mullowney cusparseStatus_t stat; 1493bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1494aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1495aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1496aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1497b175d8bbSPaul Mullowney 
PetscErrorCode ierr; 1498bda325fcSPaul Mullowney 1499bda325fcSPaul Mullowney PetscFunctionBegin; 1500aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1501aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1502bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1503aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1504aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1505bda325fcSPaul Mullowney } 1506bda325fcSPaul Mullowney 1507bda325fcSPaul Mullowney /* Get the GPU pointers */ 1508c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1509c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1510bda325fcSPaul Mullowney 15117a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1512aa372e3fSPaul Mullowney /* First, solve U */ 1513aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1514afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1516afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1517afb2bd1cSJunchao Zhang #endif 1518afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1519aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1520aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1521aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1522aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1523afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 15241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1525afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1526afb2bd1cSJunchao Zhang #endif 1527afb2bd1cSJunchao Zhang 
);CHKERRCUSPARSE(stat); 1528aa372e3fSPaul Mullowney 1529aa372e3fSPaul Mullowney /* Then, solve L */ 1530aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1531afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1534afb2bd1cSJunchao Zhang #endif 1535afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1536aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1537aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1538aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1539aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1540afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 15411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1542afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1543afb2bd1cSJunchao Zhang #endif 1544afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1545bda325fcSPaul Mullowney 1546bda325fcSPaul Mullowney /* restore */ 1547c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1548c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1549661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1550958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1551bda325fcSPaul Mullowney PetscFunctionReturn(0); 1552bda325fcSPaul Mullowney } 1553bda325fcSPaul Mullowney 15546fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15559ae82921SPaul Mullowney { 1556465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1557465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1558465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1559465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> 
xGPU; 15609ae82921SPaul Mullowney cusparseStatus_t stat; 15619ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1562aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1563aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1564aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1565b175d8bbSPaul Mullowney PetscErrorCode ierr; 15669ae82921SPaul Mullowney 15679ae82921SPaul Mullowney PetscFunctionBegin; 1568ebc8f436SDominic Meiser 1569e057df02SPaul Mullowney /* Get the GPU pointers */ 1570c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1571c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1572c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1573c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15749ae82921SPaul Mullowney 15757a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1576aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1577a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1578c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15794e4bbfaaSStefano Zampini tempGPU->begin()); 1580aa372e3fSPaul Mullowney 1581aa372e3fSPaul Mullowney /* Next, solve L */ 1582aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1583afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1585afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 
1586afb2bd1cSJunchao Zhang #endif 1587afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1588aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1589aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1590aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1591aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1592afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 15931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1594afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1595afb2bd1cSJunchao Zhang #endif 1596afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1597aa372e3fSPaul Mullowney 1598aa372e3fSPaul Mullowney /* Then, solve U */ 1599aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1600afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16011b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1602afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1603afb2bd1cSJunchao Zhang #endif 1604afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1605aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1606aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1607aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1608aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1609afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 16101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1612afb2bd1cSJunchao Zhang #endif 1613afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1614aa372e3fSPaul Mullowney 16154e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1616a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 
16174e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16184e4bbfaaSStefano Zampini xGPU); 16199ae82921SPaul Mullowney 1620c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1621c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1622661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1623958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16249ae82921SPaul Mullowney PetscFunctionReturn(0); 16259ae82921SPaul Mullowney } 16269ae82921SPaul Mullowney 16276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16289ae82921SPaul Mullowney { 1629465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1630465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16319ae82921SPaul Mullowney cusparseStatus_t stat; 16329ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1633aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1634aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1635aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1636b175d8bbSPaul Mullowney PetscErrorCode ierr; 16379ae82921SPaul Mullowney 16389ae82921SPaul Mullowney PetscFunctionBegin; 1639e057df02SPaul Mullowney /* Get the GPU pointers */ 1640c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1641c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 16429ae82921SPaul Mullowney 16437a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1644aa372e3fSPaul Mullowney /* First, solve L */ 
1645aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1646afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16471b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1648afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1649afb2bd1cSJunchao Zhang #endif 1650afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1651aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1652aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1653aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1654aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1655afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 16561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1657afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1658afb2bd1cSJunchao Zhang #endif 1659afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1660aa372e3fSPaul Mullowney 1661aa372e3fSPaul Mullowney /* Next, solve U */ 1662aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1663afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1665afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1666afb2bd1cSJunchao Zhang #endif 1667afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1668aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1669aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1670aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1671aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1672afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 16731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1674afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1675afb2bd1cSJunchao Zhang #endif 1676afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 16779ae82921SPaul 
Mullowney 1678c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1679c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1680661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1681958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16829ae82921SPaul Mullowney PetscFunctionReturn(0); 16839ae82921SPaul Mullowney } 16849ae82921SPaul Mullowney 16857e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 16867e8381f9SStefano Zampini { 16877e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16887e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 16897e8381f9SStefano Zampini cudaError_t cerr; 16907e8381f9SStefano Zampini PetscErrorCode ierr; 16917e8381f9SStefano Zampini 16927e8381f9SStefano Zampini PetscFunctionBegin; 16937e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 16947e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 16957e8381f9SStefano Zampini 16967e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16977e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 16987e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 16997e8381f9SStefano Zampini ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 17007e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17017e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17027e8381f9SStefano Zampini } 17037e8381f9SStefano Zampini PetscFunctionReturn(0); 17047e8381f9SStefano Zampini } 17057e8381f9SStefano Zampini 17067e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17077e8381f9SStefano Zampini { 
17087e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17097e8381f9SStefano Zampini PetscErrorCode ierr; 17107e8381f9SStefano Zampini 17117e8381f9SStefano Zampini PetscFunctionBegin; 17127e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 17137e8381f9SStefano Zampini *array = a->a; 17147e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 17157e8381f9SStefano Zampini PetscFunctionReturn(0); 17167e8381f9SStefano Zampini } 17177e8381f9SStefano Zampini 17186fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 17199ae82921SPaul Mullowney { 1720aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 17217c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 17229ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1723213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 17249ae82921SPaul Mullowney PetscErrorCode ierr; 1725aa372e3fSPaul Mullowney cusparseStatus_t stat; 1726abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1727b06137fdSPaul Mullowney cudaError_t err; 17289ae82921SPaul Mullowney 17299ae82921SPaul Mullowney PetscFunctionBegin; 1730e8d2b73aSMark Adams if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1731c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1732a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1733a49f1ed0SStefano Zampini CsrMatrix *matrix; 1734afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 173585ba7357SStefano Zampini 1736e8d2b73aSMark Adams if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 173785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1738afb2bd1cSJunchao Zhang matrix->values->assign(a->a, 
a->a+a->nz); 173905035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 17404863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 174185ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1742a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 174334d6c7a5SJose E. Roman } else { 1744abb89eb1SStefano Zampini PetscInt nnz; 174585ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17467c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1747a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 17487c700b8dSJunchao Zhang delete cusparsestruct->workVector; 174981902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1750a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1751a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 17529ae82921SPaul Mullowney try { 17539ae82921SPaul Mullowney if (a->compressedrow.use) { 17549ae82921SPaul Mullowney m = a->compressedrow.nrows; 17559ae82921SPaul Mullowney ii = a->compressedrow.i; 17569ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 17579ae82921SPaul Mullowney } else { 1758213423ffSJunchao Zhang m = A->rmap->n; 1759213423ffSJunchao Zhang ii = a->i; 1760e6e9a74fSStefano Zampini ridx = NULL; 17619ae82921SPaul Mullowney } 1762e8d2b73aSMark Adams if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1763e8d2b73aSMark Adams if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1764abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1765abb89eb1SStefano Zampini else nnz = a->nz; 17669ae82921SPaul Mullowney 176785ba7357SStefano Zampini /* create cusparse matrix */ 1768abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1769aa372e3fSPaul Mullowney matstruct 
= new Mat_SeqAIJCUSPARSEMultStruct; 177057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 177157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 177257d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 17739ae82921SPaul Mullowney 1774afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 17757656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 17767656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1777afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17787656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17797656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 178057d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1781b06137fdSPaul Mullowney 1782aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1783aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1784aa372e3fSPaul Mullowney /* set the matrix */ 1785afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1786afb2bd1cSJunchao Zhang mat->num_rows = m; 1787afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1788abb89eb1SStefano Zampini mat->num_entries = nnz; 1789afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1790afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 17919ae82921SPaul Mullowney 1792abb89eb1SStefano 
Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1793abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1794aa372e3fSPaul Mullowney 1795abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1796abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1797aa372e3fSPaul Mullowney 1798aa372e3fSPaul Mullowney /* assign the pointer */ 1799afb2bd1cSJunchao Zhang matstruct->mat = mat; 1800afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1801afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1802afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1803afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1804afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1805afb2bd1cSJunchao Zhang mat->values->data().get(), 1806afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1807afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1808afb2bd1cSJunchao Zhang } 1809afb2bd1cSJunchao Zhang #endif 1810aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1811afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1812afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1813afb2bd1cSJunchao Zhang #else 1814afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1815afb2bd1cSJunchao Zhang mat->num_rows = m; 1816afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1817abb89eb1SStefano Zampini mat->num_entries = nnz; 1818afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1819afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1820aa372e3fSPaul Mullowney 1821abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 
1822abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1823aa372e3fSPaul Mullowney 1824abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1825abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1826aa372e3fSPaul Mullowney 1827aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 182857d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1829aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1830aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1831afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1832afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1833afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1834afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 183557d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1836aa372e3fSPaul Mullowney /* assign the pointer */ 1837aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1838aa372e3fSPaul Mullowney 1839afb2bd1cSJunchao Zhang if (mat) { 1840afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1841afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1842afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1843afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1844087f3262SPaul Mullowney } 1845afb2bd1cSJunchao Zhang #endif 1846087f3262SPaul Mullowney } 1847ca45077fSPaul Mullowney 1848aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1849213423ffSJunchao Zhang if (a->compressedrow.use) { 1850213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1851aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1852aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1853213423ffSJunchao Zhang tmp = m; 1854213423ffSJunchao Zhang 
} else { 1855213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1856213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1857213423ffSJunchao Zhang tmp = 0; 1858213423ffSJunchao Zhang } 1859213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1860aa372e3fSPaul Mullowney 1861aa372e3fSPaul Mullowney /* assign the pointer */ 1862aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 18639ae82921SPaul Mullowney } catch(char *ex) { 18649ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18659ae82921SPaul Mullowney } 186605035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 186785ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 186834d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 186934d6c7a5SJose E. Roman } 1870abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 18719ae82921SPaul Mullowney } 18729ae82921SPaul Mullowney PetscFunctionReturn(0); 18739ae82921SPaul Mullowney } 18749ae82921SPaul Mullowney 1875c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1876aa372e3fSPaul Mullowney { 1877aa372e3fSPaul Mullowney template <typename Tuple> 1878aa372e3fSPaul Mullowney __host__ __device__ 1879aa372e3fSPaul Mullowney void operator()(Tuple t) 1880aa372e3fSPaul Mullowney { 1881aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1882aa372e3fSPaul Mullowney } 1883aa372e3fSPaul Mullowney }; 1884aa372e3fSPaul Mullowney 18857e8381f9SStefano Zampini struct VecCUDAEquals 18867e8381f9SStefano Zampini { 18877e8381f9SStefano Zampini template <typename Tuple> 18887e8381f9SStefano Zampini __host__ __device__ 18897e8381f9SStefano Zampini void operator()(Tuple t) 18907e8381f9SStefano Zampini { 18917e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 18927e8381f9SStefano Zampini } 18937e8381f9SStefano Zampini }; 
18947e8381f9SStefano Zampini 1895e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1896e6e9a74fSStefano Zampini { 1897e6e9a74fSStefano Zampini template <typename Tuple> 1898e6e9a74fSStefano Zampini __host__ __device__ 1899e6e9a74fSStefano Zampini void operator()(Tuple t) 1900e6e9a74fSStefano Zampini { 1901e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1902e6e9a74fSStefano Zampini } 1903e6e9a74fSStefano Zampini }; 1904e6e9a74fSStefano Zampini 1905afb2bd1cSJunchao Zhang struct MatMatCusparse { 1906ccdfe979SStefano Zampini PetscBool cisdense; 1907ccdfe979SStefano Zampini PetscScalar *Bt; 1908ccdfe979SStefano Zampini Mat X; 1909fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1910fcdce8c4SStefano Zampini PetscLogDouble flops; 1911fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 1912afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1913fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 1914afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1915afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1916afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1917afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1918fcdce8c4SStefano Zampini size_t mmBufferSize; 1919fcdce8c4SStefano Zampini void *mmBuffer; 1920fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1921fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 1922afb2bd1cSJunchao Zhang #endif 1923afb2bd1cSJunchao Zhang }; 1924ccdfe979SStefano Zampini 1925ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1926ccdfe979SStefano Zampini { 1927ccdfe979SStefano Zampini PetscErrorCode ierr; 1928ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1929ccdfe979SStefano Zampini cudaError_t cerr; 1930fcdce8c4SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1931fcdce8c4SStefano Zampini cusparseStatus_t stat; 1932fcdce8c4SStefano Zampini #endif 1933ccdfe979SStefano Zampini 1934ccdfe979SStefano Zampini PetscFunctionBegin; 1935ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1936fcdce8c4SStefano Zampini delete mmdata->Bcsr; 1937afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1938fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1939fcdce8c4SStefano Zampini if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1940fcdce8c4SStefano Zampini if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1941afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1942afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1943fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1944afb2bd1cSJunchao Zhang #endif 1945ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1946ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1947ccdfe979SStefano Zampini PetscFunctionReturn(0); 1948ccdfe979SStefano Zampini } 1949ccdfe979SStefano Zampini 1950ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1951ccdfe979SStefano Zampini 1952ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1953ccdfe979SStefano Zampini { 1954ccdfe979SStefano Zampini Mat_Product *product = C->product; 1955ccdfe979SStefano Zampini Mat A,B; 1956afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1957ccdfe979SStefano Zampini PetscBool flg,biscuda; 1958ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1959ccdfe979SStefano Zampini 
cusparseStatus_t stat; 1960ccdfe979SStefano Zampini cusparseOperation_t opA; 1961ccdfe979SStefano Zampini const PetscScalar *barray; 1962ccdfe979SStefano Zampini PetscScalar *carray; 1963ccdfe979SStefano Zampini PetscErrorCode ierr; 1964ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1965ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1966ccdfe979SStefano Zampini CsrMatrix *csrmat; 1967ccdfe979SStefano Zampini 1968ccdfe979SStefano Zampini PetscFunctionBegin; 1969ccdfe979SStefano Zampini MatCheckProduct(C,1); 1970e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 1971ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1972ccdfe979SStefano Zampini A = product->A; 1973ccdfe979SStefano Zampini B = product->B; 1974ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1975e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 1976ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1977ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1978ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1979ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1980ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1981ccdfe979SStefano Zampini switch (product->type) { 1982ccdfe979SStefano Zampini case MATPRODUCT_AB: 1983ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1984ccdfe979SStefano Zampini mat = cusp->mat; 1985ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1986ccdfe979SStefano Zampini m = A->rmap->n; 1987ccdfe979SStefano Zampini n = B->cmap->n; 1988ccdfe979SStefano 
Zampini break; 1989ccdfe979SStefano Zampini case MATPRODUCT_AtB: 19901a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 1991e6e9a74fSStefano Zampini mat = cusp->mat; 1992e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1993e6e9a74fSStefano Zampini } else { 19941a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 1995ccdfe979SStefano Zampini mat = cusp->matTranspose; 1996ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1997e6e9a74fSStefano Zampini } 1998ccdfe979SStefano Zampini m = A->cmap->n; 1999ccdfe979SStefano Zampini n = B->cmap->n; 2000ccdfe979SStefano Zampini break; 2001ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2002ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2003ccdfe979SStefano Zampini mat = cusp->mat; 2004ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2005ccdfe979SStefano Zampini m = A->rmap->n; 2006ccdfe979SStefano Zampini n = B->rmap->n; 2007ccdfe979SStefano Zampini break; 2008ccdfe979SStefano Zampini default: 2009e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2010ccdfe979SStefano Zampini } 2011e8d2b73aSMark Adams if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2012ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2013ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2014ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2015afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2016ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2017afb2bd1cSJunchao Zhang 2018ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2019c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == 
MATPRODUCT_PtAP) { 2020c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2021c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2022c8378d12SStefano Zampini } else { 2023c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2024c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2025c8378d12SStefano Zampini } 2026c8378d12SStefano Zampini 2027c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2028afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2029afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2030fcdce8c4SStefano Zampini /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2031afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2032fcdce8c4SStefano Zampini size_t mmBufferSize; 2033afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2034afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2035afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2036afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2037afb2bd1cSJunchao Zhang } 2038c8378d12SStefano Zampini 2039afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2040afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2041afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2042afb2bd1cSJunchao Zhang 
mmdata->Clda = clda; 2043afb2bd1cSJunchao Zhang } 2044afb2bd1cSJunchao Zhang 2045afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2046afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2047afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2048afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2049afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2050afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2051afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2052afb2bd1cSJunchao Zhang } 2053afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2054afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2055afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2056fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2057fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2058*ee7b52eaSHong Zhang cudaError_t cerr; 2059fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2060fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2061fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2062fcdce8c4SStefano Zampini } 2063afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2064afb2bd1cSJunchao Zhang } else { 2065afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2066afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2067afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2068afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2069afb2bd1cSJunchao Zhang } 
2070afb2bd1cSJunchao Zhang 2071afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2072afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2073afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2074afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2075fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2076afb2bd1cSJunchao Zhang #else 2077afb2bd1cSJunchao Zhang PetscInt k; 2078afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2079ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2080ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2081ccdfe979SStefano Zampini cublasStatus_t cerr; 2082ccdfe979SStefano Zampini 2083ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2084ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2085ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2086ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2087ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2088ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2089ccdfe979SStefano Zampini blda = B->cmap->n; 2090afb2bd1cSJunchao Zhang k = B->cmap->n; 2091afb2bd1cSJunchao Zhang } else { 2092afb2bd1cSJunchao Zhang k = B->rmap->n; 2093ccdfe979SStefano Zampini } 2094ccdfe979SStefano Zampini 2095afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2096ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2097afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2098ccdfe979SStefano Zampini csrmat->values->data().get(), 2099ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2100ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2101ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2102ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2103afb2bd1cSJunchao Zhang #endif 2104c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2105c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2106ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2107ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2108ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2109ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2110ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2111ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2112ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2113ccdfe979SStefano Zampini } else { 2114ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2115ccdfe979SStefano Zampini } 2116ccdfe979SStefano Zampini if (mmdata->cisdense) { 2117ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2118ccdfe979SStefano Zampini } 2119ccdfe979SStefano Zampini if (!biscuda) { 2120ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2121ccdfe979SStefano Zampini } 2122ccdfe979SStefano Zampini PetscFunctionReturn(0); 2123ccdfe979SStefano Zampini } 2124ccdfe979SStefano Zampini 2125ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2126ccdfe979SStefano Zampini { 2127ccdfe979SStefano Zampini Mat_Product *product = C->product; 2128ccdfe979SStefano Zampini Mat A,B; 2129ccdfe979SStefano Zampini PetscInt m,n; 2130ccdfe979SStefano Zampini PetscBool cisdense,flg; 
2131ccdfe979SStefano Zampini PetscErrorCode ierr; 2132ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2133ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2134ccdfe979SStefano Zampini 2135ccdfe979SStefano Zampini PetscFunctionBegin; 2136ccdfe979SStefano Zampini MatCheckProduct(C,1); 2137e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2138ccdfe979SStefano Zampini A = product->A; 2139ccdfe979SStefano Zampini B = product->B; 2140ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2141e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2142ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2143e8d2b73aSMark Adams if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2144ccdfe979SStefano Zampini switch (product->type) { 2145ccdfe979SStefano Zampini case MATPRODUCT_AB: 2146ccdfe979SStefano Zampini m = A->rmap->n; 2147ccdfe979SStefano Zampini n = B->cmap->n; 2148ccdfe979SStefano Zampini break; 2149ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2150ccdfe979SStefano Zampini m = A->cmap->n; 2151ccdfe979SStefano Zampini n = B->cmap->n; 2152ccdfe979SStefano Zampini break; 2153ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2154ccdfe979SStefano Zampini m = A->rmap->n; 2155ccdfe979SStefano Zampini n = B->rmap->n; 2156ccdfe979SStefano Zampini break; 2157ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2158ccdfe979SStefano Zampini m = B->cmap->n; 2159ccdfe979SStefano Zampini n = B->cmap->n; 2160ccdfe979SStefano Zampini break; 2161ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2162ccdfe979SStefano Zampini m = B->rmap->n; 2163ccdfe979SStefano Zampini n = B->rmap->n; 2164ccdfe979SStefano Zampini break; 2165ccdfe979SStefano Zampini default: 2166e8d2b73aSMark Adams 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2167ccdfe979SStefano Zampini } 2168ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2169ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2170ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2171ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2172ccdfe979SStefano Zampini 2173ccdfe979SStefano Zampini /* product data */ 2174ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2175ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2176afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2177afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2178ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2179afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2180ccdfe979SStefano Zampini } 2181afb2bd1cSJunchao Zhang #endif 2182ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2183ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2184ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2185ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2186ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2187ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2188ccdfe979SStefano Zampini } else { 2189ccdfe979SStefano Zampini ierr 
= MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2190ccdfe979SStefano Zampini } 2191ccdfe979SStefano Zampini } 2192ccdfe979SStefano Zampini C->product->data = mmdata; 2193ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2194ccdfe979SStefano Zampini 2195ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2196ccdfe979SStefano Zampini PetscFunctionReturn(0); 2197ccdfe979SStefano Zampini } 2198ccdfe979SStefano Zampini 2199fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2200ccdfe979SStefano Zampini { 2201ccdfe979SStefano Zampini Mat_Product *product = C->product; 2202fcdce8c4SStefano Zampini Mat A,B; 2203fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2204fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2205fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2206fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2207fcdce8c4SStefano Zampini PetscBool flg; 2208ccdfe979SStefano Zampini PetscErrorCode ierr; 2209fcdce8c4SStefano Zampini cusparseStatus_t stat; 2210fcdce8c4SStefano Zampini cudaError_t cerr; 2211fcdce8c4SStefano Zampini MatProductType ptype; 2212fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2213fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2214fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2215fcdce8c4SStefano Zampini #endif 2216ccdfe979SStefano Zampini 2217ccdfe979SStefano Zampini PetscFunctionBegin; 2218ccdfe979SStefano Zampini MatCheckProduct(C,1); 2219e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2220fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2221e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type 
%s",((PetscObject)C)->type_name); 2222fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2223fcdce8c4SStefano Zampini A = product->A; 2224fcdce8c4SStefano Zampini B = product->B; 2225fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2226fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2227fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2228e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2229fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2230e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2231fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2232e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2233fcdce8c4SStefano Zampini goto finalize; 2234fcdce8c4SStefano Zampini } 2235fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2236fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2237e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2238fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2239e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2240fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2241fcdce8c4SStefano Zampini if (B->boundtocpu) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2242fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2243fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2244fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2245e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2246e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2247e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2248fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2249fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2250fcdce8c4SStefano Zampini 2251fcdce8c4SStefano Zampini ptype = product->type; 2252fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2253fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2254fcdce8c4SStefano Zampini switch (ptype) { 2255fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2256fcdce8c4SStefano Zampini Amat = Acusp->mat; 2257fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2258fcdce8c4SStefano Zampini break; 2259fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2260fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2261fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2262fcdce8c4SStefano Zampini break; 2263fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2264fcdce8c4SStefano Zampini Amat = Acusp->mat; 2265fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2266fcdce8c4SStefano Zampini break; 2267fcdce8c4SStefano Zampini default: 2268e8d2b73aSMark Adams SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported 
product type %s",MatProductTypes[product->type]); 2269fcdce8c4SStefano Zampini } 2270fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2271e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2272e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2273e8d2b73aSMark Adams if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2274fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2275fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2276fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2277e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2278e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2279e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2280fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2281fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2282fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2283fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2284fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2285fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2286fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2287fcdce8c4SStefano Zampini stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2288fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2289fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2290fcdce8c4SStefano Zampini #else 2291fcdce8c4SStefano Zampini stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2292fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2293fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2294fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2295fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2296fcdce8c4SStefano Zampini #endif 2297fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2298fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2299fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2300fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2301fcdce8c4SStefano Zampini finalize: 2302fcdce8c4SStefano Zampini 
/* shorter version of MatAssemblyEnd_SeqAIJ */ 2303fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2304fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2305fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2306fcdce8c4SStefano Zampini c->reallocs = 0; 2307fcdce8c4SStefano Zampini C->info.mallocs += 0; 2308fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2309fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2310fcdce8c4SStefano Zampini C->num_ass++; 2311ccdfe979SStefano Zampini PetscFunctionReturn(0); 2312ccdfe979SStefano Zampini } 2313fcdce8c4SStefano Zampini 2314fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2315fcdce8c4SStefano Zampini { 2316fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2317fcdce8c4SStefano Zampini Mat A,B; 2318fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2319fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2320fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2321fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2322fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2323fcdce8c4SStefano Zampini PetscBool flg; 2324fcdce8c4SStefano Zampini PetscErrorCode ierr; 2325fcdce8c4SStefano Zampini cusparseStatus_t stat; 2326fcdce8c4SStefano Zampini cudaError_t cerr; 2327fcdce8c4SStefano Zampini MatProductType ptype; 2328fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2329fcdce8c4SStefano Zampini PetscLogDouble flops; 2330fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2331fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2332fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2333fcdce8c4SStefano Zampini size_t bufSize2; 2334fcdce8c4SStefano Zampini cusparseSpMatDescr_t 
BmatSpDescr; 2335fcdce8c4SStefano Zampini #else 2336fcdce8c4SStefano Zampini int cnz; 2337fcdce8c4SStefano Zampini #endif 2338fcdce8c4SStefano Zampini 2339fcdce8c4SStefano Zampini PetscFunctionBegin; 2340fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2341e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2342fcdce8c4SStefano Zampini A = product->A; 2343fcdce8c4SStefano Zampini B = product->B; 2344fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2345e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2346fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2347e8d2b73aSMark Adams if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2348fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2349fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2350fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2351fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2352e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2353e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2354fcdce8c4SStefano Zampini 2355fcdce8c4SStefano Zampini /* product data */ 2356fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2357fcdce8c4SStefano Zampini C->product->data = mmdata; 2358fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2359fcdce8c4SStefano Zampini 2360fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2361fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 
2362fcdce8c4SStefano Zampini ptype = product->type; 2363fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2364fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2365fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2366fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2367fcdce8c4SStefano Zampini switch (ptype) { 2368fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2369fcdce8c4SStefano Zampini m = A->rmap->n; 2370fcdce8c4SStefano Zampini n = B->cmap->n; 2371fcdce8c4SStefano Zampini k = A->cmap->n; 2372fcdce8c4SStefano Zampini Amat = Acusp->mat; 2373fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2374fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2375fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2376fcdce8c4SStefano Zampini break; 2377fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2378fcdce8c4SStefano Zampini m = A->cmap->n; 2379fcdce8c4SStefano Zampini n = B->cmap->n; 2380fcdce8c4SStefano Zampini k = A->rmap->n; 23811a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2382fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2383fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2384fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2385fcdce8c4SStefano Zampini break; 2386fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2387fcdce8c4SStefano Zampini m = A->rmap->n; 2388fcdce8c4SStefano Zampini n = B->rmap->n; 2389fcdce8c4SStefano Zampini k = A->cmap->n; 23901a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2391fcdce8c4SStefano Zampini Amat = Acusp->mat; 2392fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2393fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2394fcdce8c4SStefano Zampini break; 2395fcdce8c4SStefano Zampini default: 2396e8d2b73aSMark Adams 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2397fcdce8c4SStefano Zampini } 2398fcdce8c4SStefano Zampini 2399fcdce8c4SStefano Zampini /* create cusparse matrix */ 2400fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2401fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2402fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2403fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2404fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2405fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2406fcdce8c4SStefano Zampini 2407fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2408fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2409fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2410fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2411fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2412fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2413fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2414fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2415fcdce8c4SStefano Zampini } else { 2416fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2417fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2418fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2419fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2420fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2421fcdce8c4SStefano Zampini } 2422fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2423fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2424fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2425fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2426fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2427fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2428fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2429fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2430fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2431fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2432fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2433fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2434fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2435fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2436fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2437fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2438fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2439fcdce8c4SStefano Zampini c->nz = 0; 2440fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2441fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2442fcdce8c4SStefano Zampini goto finalizesym; 2443fcdce8c4SStefano Zampini } 2444fcdce8c4SStefano Zampini 2445e8d2b73aSMark Adams if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2446e8d2b73aSMark Adams if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2447fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2448fcdce8c4SStefano Zampini if (!biscompressed) { 2449fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2450fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2451fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2452fcdce8c4SStefano Zampini #endif 2453fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2454fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2455fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2456fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2457fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2458fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2459fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2460fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2461fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2462fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2463fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2464fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2465fcdce8c4SStefano Zampini } 2466fcdce8c4SStefano Zampini 
Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2467fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2468fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2469fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2470fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2471fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2472fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2473fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2474fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2475fcdce8c4SStefano Zampini } 2476fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2477fcdce8c4SStefano Zampini #endif 2478fcdce8c4SStefano Zampini } 2479e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2480e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2481fcdce8c4SStefano Zampini /* precompute flops count */ 2482fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2483fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2484fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2485fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2486fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2487fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2488fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2489fcdce8c4SStefano Zampini } 2490fcdce8c4SStefano Zampini } 2491fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2492fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2493fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2494fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2495fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 2496fcdce8c4SStefano Zampini } 
2497fcdce8c4SStefano Zampini } else { /* TODO */ 2498fcdce8c4SStefano Zampini flops = 0.; 2499fcdce8c4SStefano Zampini } 2500fcdce8c4SStefano Zampini 2501fcdce8c4SStefano Zampini mmdata->flops = flops; 2502fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2503fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2504fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2505fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2506fcdce8c4SStefano Zampini NULL, NULL, NULL, 2507fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2508fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2509fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2510fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2511fcdce8c4SStefano Zampini stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2512fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2513fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2514fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2515bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2516fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2517fcdce8c4SStefano Zampini stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2518fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2519fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2520fcdce8c4SStefano Zampini mmdata->spgemmDesc, 
&bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2521fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2522fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2523fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2524fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2525fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2526fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2527fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2528fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2529fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2530fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2531bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2532fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2533fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2534fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2535fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2536fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2537fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2538fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2539fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 254000702c57SStefano Zampini ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2541fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2542fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2543fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2544fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2545fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2546fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2547fcdce8c4SStefano Zampini stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2548fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2549fcdce8c4SStefano Zampini cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2550fcdce8c4SStefano Zampini #else 2551fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2552fcdce8c4SStefano Zampini stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2553fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2554fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2555fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2556fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2557fcdce8c4SStefano Zampini c->nz = cnz; 2558fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2559fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2560fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2561fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2562fcdce8c4SStefano Zampini 2563fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2564fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2565fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2566fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2567fcdce8c4SStefano Zampini stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2568fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2569fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2570fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2571fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2572fcdce8c4SStefano Zampini #endif 2573fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2574fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2575fcdce8c4SStefano Zampini finalizesym: 2576fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2577fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2578fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2579fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2580fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2581fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2582fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2583fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2584fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2585fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2586fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2587fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2588fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2589fcdce8c4SStefano Zampini cerr = 
cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2590fcdce8c4SStefano Zampini } else { 2591fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2592fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2593fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2594fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2595fcdce8c4SStefano Zampini } 2596fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2597fcdce8c4SStefano Zampini PetscInt r = 0; 2598fcdce8c4SStefano Zampini c->i[0] = 0; 2599fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2600fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2601fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2602fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2603fcdce8c4SStefano Zampini } 2604fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2605fcdce8c4SStefano Zampini } 2606fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2607fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2608fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2609fcdce8c4SStefano Zampini c->maxnz = c->nz; 2610fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2611fcdce8c4SStefano Zampini c->rmax = 0; 2612fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2613fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2614fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2615fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 
2616fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2617fcdce8c4SStefano Zampini } 2618fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2619fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2620fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2621fcdce8c4SStefano Zampini 2622fcdce8c4SStefano Zampini C->nonzerostate++; 2623fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2624fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2625fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2626fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2627fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2628fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2629fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2630abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2631fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2632fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2633fcdce8c4SStefano Zampini } 2634fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2635fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2636fcdce8c4SStefano Zampini } 2637fcdce8c4SStefano Zampini 2638fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2639fcdce8c4SStefano Zampini 2640fcdce8c4SStefano Zampini /* MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic-phase routine (mat->ops->productsymbolic) for the product stored in mat->product; handles sparse or dense B. Dense B: AB, AtB, ABt, PtAP and RARt use MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA unless A is bound to the CPU, in which case MatProductSetFromOptions_SeqAIJ_SeqDense() is called instead; ABC uses MatProductSymbolic_ABC_Basic. B (and, for ABC, C) of type MATSEQAIJCUSPARSE: AB, AtB and ABt use MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; PtAP, RARt and ABC use MatProductSymbolic_ABC_Basic. Anything else is forwarded to MatProductSetFromOptions_SeqAIJ(). Returns 0 on success; errors from the called routines are propagated through CHKERRQ. */ 2641fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2642fcdce8c4SStefano Zampini { 2643fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2644fcdce8c4SStefano Zampini PetscErrorCode ierr; 2645fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; /* Ciscusp starts as TRUE so that it only restricts the ABC case, where it is reset and re-evaluated below */ 
2646fcdce8c4SStefano Zampini 2647fcdce8c4SStefano Zampini PetscFunctionBegin; 2648fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2649fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2650abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { /* B is only tested for the CUSPARSE type when neither A nor B is bound to the CPU; otherwise Biscusp stays FALSE */ 2651fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2652fcdce8c4SStefano Zampini } 2653fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { /* triple product: C must also be an unbound MATSEQAIJCUSPARSE matrix for Ciscusp to be TRUE */ 2654fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2655fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2656fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2657fcdce8c4SStefano Zampini } 2658fcdce8c4SStefano Zampini } 2659fcdce8c4SStefano Zampini if (isdense) { /* B has base type MATSEQDENSE */ 2660ccdfe979SStefano Zampini switch (product->type) { 2661ccdfe979SStefano Zampini case MATPRODUCT_AB: 2662ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2663ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2664ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2665ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2666fcdce8c4SStefano Zampini if (product->A->boundtocpu) { /* A bound to the CPU: defer to the SeqAIJ/SeqDense setup routine */ 2667fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2668fcdce8c4SStefano Zampini } else { 2669fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2670fcdce8c4SStefano Zampini } 2671fcdce8c4SStefano Zampini break; 2672fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2673fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2674fcdce8c4SStefano Zampini break; 2675ccdfe979SStefano Zampini default: /* any other product type: productsymbolic is not assigned here */ 2676ccdfe979SStefano Zampini break; 2677ccdfe979SStefano Zampini } 2678fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { /* B (and C, for ABC) are MATSEQAIJCUSPARSE and not bound to the CPU */ 2679fcdce8c4SStefano Zampini switch (product->type) { 2680fcdce8c4SStefano 
Zampini case MATPRODUCT_AB: 2681fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2682fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2683fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2684fcdce8c4SStefano Zampini break; 2685fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2686fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2687fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2688fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2689fcdce8c4SStefano Zampini break; 2690fcdce8c4SStefano Zampini default: /* any other product type: productsymbolic is not assigned here */ 2691fcdce8c4SStefano Zampini break; 2692fcdce8c4SStefano Zampini } 2693fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 2694fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2695fcdce8c4SStefano Zampini } 2696ccdfe979SStefano Zampini PetscFunctionReturn(0); 2697ccdfe979SStefano Zampini } 2698ccdfe979SStefano Zampini 26996fa9248bSJed Brown /* MatMult_SeqAIJCUSPARSE - yy = A xx. Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL) and both the transpose and Hermitian flags off. */ static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 27009ae82921SPaul Mullowney { 2701b175d8bbSPaul Mullowney PetscErrorCode ierr; 27029ae82921SPaul Mullowney 27039ae82921SPaul Mullowney PetscFunctionBegin; 2704e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2705e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2706e6e9a74fSStefano Zampini } 2707e6e9a74fSStefano Zampini 2708e6e9a74fSStefano Zampini /* MatMultAdd_SeqAIJCUSPARSE - zz = A xx + yy. Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with both the transpose and Hermitian flags off. */ static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2709e6e9a74fSStefano Zampini { 2710e6e9a74fSStefano Zampini PetscErrorCode ierr; 2711e6e9a74fSStefano Zampini 2712e6e9a74fSStefano Zampini PetscFunctionBegin; 2713e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2714e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2715e6e9a74fSStefano Zampini } 2716e6e9a74fSStefano Zampini 2717e6e9a74fSStefano Zampini /* MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H xx. Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL) and both the transpose and Hermitian flags on. */ static PetscErrorCode 
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2718e6e9a74fSStefano Zampini { 2719e6e9a74fSStefano Zampini PetscErrorCode ierr; 2720e6e9a74fSStefano Zampini 2721e6e9a74fSStefano Zampini PetscFunctionBegin; 2722e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2723e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2724e6e9a74fSStefano Zampini } 2725e6e9a74fSStefano Zampini 2726e6e9a74fSStefano Zampini /* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H xx + yy. Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with both the transpose and Hermitian flags on. */ static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2727e6e9a74fSStefano Zampini { 2728e6e9a74fSStefano Zampini PetscErrorCode ierr; 2729e6e9a74fSStefano Zampini 2730e6e9a74fSStefano Zampini PetscFunctionBegin; 2731e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 27329ae82921SPaul Mullowney PetscFunctionReturn(0); 27339ae82921SPaul Mullowney } 27349ae82921SPaul Mullowney 27356fa9248bSJed Brown /* MatMultTranspose_SeqAIJCUSPARSE - yy = A^T xx. Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL), the transpose flag on and the Hermitian flag off. */ static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2736ca45077fSPaul Mullowney { 2737b175d8bbSPaul Mullowney PetscErrorCode ierr; 2738ca45077fSPaul Mullowney 2739ca45077fSPaul Mullowney PetscFunctionBegin; 2740e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2741ca45077fSPaul Mullowney PetscFunctionReturn(0); 2742ca45077fSPaul Mullowney } 2743ca45077fSPaul Mullowney 2744a0e72f99SJunchao Zhang /* ScatterAdd - CUDA kernel: the thread with global index i = blockIdx.x*blockDim.x + threadIdx.x performs y[idx[i]] += x[i] when i < n; threads with i >= n do nothing. The update is a plain (non-atomic) +=, so this presumably relies on the entries of idx being distinct -- TODO confirm at the launch site. NOTE(review): i is an int while n is a PetscInt; confirm n cannot exceed the int range when PetscInt is 64-bit. */ __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2745a0e72f99SJunchao Zhang { 2746a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2747a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2748a0e72f99SJunchao Zhang } 2749a0e72f99SJunchao Zhang 2750afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2751e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 27529ae82921SPaul Mullowney { 27539ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2754aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 27559ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2756e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2757b175d8bbSPaul Mullowney PetscErrorCode ierr; 2758aa372e3fSPaul Mullowney cusparseStatus_t stat; 2759e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2760e6e9a74fSStefano Zampini PetscBool compressed; 2761afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2762afb2bd1cSJunchao Zhang PetscInt nx,ny; 2763afb2bd1cSJunchao Zhang #endif 27646e111a19SKarl Rupp 27659ae82921SPaul Mullowney PetscFunctionBegin; 2766e8d2b73aSMark Adams if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2767e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2768afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2769d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2770e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2771e6e9a74fSStefano Zampini } 277234d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 277334d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2774e6e9a74fSStefano Zampini if (!trans) { 27759ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2776e8d2b73aSMark Adams if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2777e6e9a74fSStefano Zampini } else { 27781a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 2779e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2780e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2781e6e9a74fSStefano Zampini } else { 27821a2c6b5cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2783e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2784e6e9a74fSStefano Zampini } 2785e6e9a74fSStefano Zampini } 2786e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2787e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2788213423ffSJunchao Zhang 2789e6e9a74fSStefano Zampini try { 2790e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2791213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2792213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2793afb2bd1cSJunchao Zhang 279485ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2795e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2796afb2bd1cSJunchao Zhang /* z = A x + beta y. 2797afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
2798afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2799afb2bd1cSJunchao Zhang */ 2800e6e9a74fSStefano Zampini xptr = xarray; 2801afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2802213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2803afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2804afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2805afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2806afb2bd1cSJunchao Zhang */ 2807afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2808afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2809afb2bd1cSJunchao Zhang nx = mat->num_cols; 2810afb2bd1cSJunchao Zhang ny = mat->num_rows; 2811afb2bd1cSJunchao Zhang } 2812afb2bd1cSJunchao Zhang #endif 2813e6e9a74fSStefano Zampini } else { 2814afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2815afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2816afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2817afb2bd1cSJunchao Zhang */ 2818afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2819e6e9a74fSStefano Zampini dptr = zarray; 2820e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2821afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2822e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2823a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2824e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2825e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2826e6e9a74fSStefano Zampini } 2827afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2828afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2829afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2830afb2bd1cSJunchao Zhang nx = mat->num_rows; 2831afb2bd1cSJunchao Zhang ny = mat->num_cols; 2832afb2bd1cSJunchao Zhang } 2833afb2bd1cSJunchao Zhang #endif 2834e6e9a74fSStefano Zampini } 28359ae82921SPaul Mullowney 2836afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2837aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2838afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2839afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2840afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2841*ee7b52eaSHong Zhang cudaError_t cerr; 2842afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2843afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2844afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2845afb2bd1cSJunchao Zhang matstruct->matDescr, 2846afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2847afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2848afb2bd1cSJunchao Zhang cusparse_scalartype, 2849afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2850afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2851afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2852afb2bd1cSJunchao Zhang 2853afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2854afb2bd1cSJunchao Zhang } else { 2855afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2856afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2857afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2858afb2bd1cSJunchao Zhang } 2859afb2bd1cSJunchao Zhang 2860afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2861afb2bd1cSJunchao Zhang matstruct->alpha_one, 28621a2c6b5cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2863afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2864afb2bd1cSJunchao Zhang beta, 2865afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2866afb2bd1cSJunchao Zhang cusparse_scalartype, 2867afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2868afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2869afb2bd1cSJunchao Zhang #else 28707656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 
2871e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2872a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 2873afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2874aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2875e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 287657d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2877afb2bd1cSJunchao Zhang #endif 2878aa372e3fSPaul Mullowney } else { 2879213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2880afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2881afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2882afb2bd1cSJunchao Zhang #else 2883301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2884e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2885afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2886e6e9a74fSStefano Zampini xptr, beta, 288757d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2888afb2bd1cSJunchao Zhang #endif 2889a65300a6SPaul Mullowney } 2890aa372e3fSPaul Mullowney } 2891958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2892aa372e3fSPaul Mullowney 2893e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2894213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2895213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2896213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2897e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 2898213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 28997656d835SStefano Zampini } 2900213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2901c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 29027656d835SStefano Zampini } 29037656d835SStefano Zampini 2904213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2905213423ffSJunchao Zhang if (compressed) { 2906e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2907a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2908a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2909a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
2910a0e72f99SJunchao Zhang */ 2911a0e72f99SJunchao Zhang #if 0 2912a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2913a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2914a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2915e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2916c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 2917a0e72f99SJunchao Zhang #else 2918a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 2919a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2920a0e72f99SJunchao Zhang #endif 2921958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2922e6e9a74fSStefano Zampini } 2923e6e9a74fSStefano Zampini } else { 2924e6e9a74fSStefano Zampini if (yy && yy != zz) { 2925e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2926e6e9a74fSStefano Zampini } 2927e6e9a74fSStefano Zampini } 2928e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2929213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2930213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 29319ae82921SPaul Mullowney } catch(char *ex) { 29329ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 29339ae82921SPaul Mullowney } 2934e6e9a74fSStefano Zampini if (yy) { 2935958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2936e6e9a74fSStefano 
Zampini } else { 2937e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2938e6e9a74fSStefano Zampini } 29399ae82921SPaul Mullowney PetscFunctionReturn(0); 29409ae82921SPaul Mullowney } 29419ae82921SPaul Mullowney 29426fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2943ca45077fSPaul Mullowney { 2944b175d8bbSPaul Mullowney PetscErrorCode ierr; 29456e111a19SKarl Rupp 2946ca45077fSPaul Mullowney PetscFunctionBegin; 2947e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2948ca45077fSPaul Mullowney PetscFunctionReturn(0); 2949ca45077fSPaul Mullowney } 2950ca45077fSPaul Mullowney 29516fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 29529ae82921SPaul Mullowney { 29539ae82921SPaul Mullowney PetscErrorCode ierr; 2954a587d139SMark PetscSplitCSRDataStructure *d_mat = NULL; 29559ae82921SPaul Mullowney PetscFunctionBegin; 2956bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 29573fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2958bc3f50f2SPaul Mullowney } 29593fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
29603fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2961a587d139SMark if (d_mat) { 29623fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 29633fa6b06aSMark Adams } 29643fa6b06aSMark Adams 29659ae82921SPaul Mullowney PetscFunctionReturn(0); 29669ae82921SPaul Mullowney } 29679ae82921SPaul Mullowney 29689ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2969e057df02SPaul Mullowney /*@ 29709ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2971e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2972e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2973e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2974e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2975e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 29769ae82921SPaul Mullowney 2977d083f849SBarry Smith Collective 29789ae82921SPaul Mullowney 29799ae82921SPaul Mullowney Input Parameters: 29809ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 29819ae82921SPaul Mullowney . m - number of rows 29829ae82921SPaul Mullowney . n - number of columns 29839ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 29849ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 29850298fd71SBarry Smith (possibly different for each row) or NULL 29869ae82921SPaul Mullowney 29879ae82921SPaul Mullowney Output Parameter: 29889ae82921SPaul Mullowney . 
A - the matrix 29899ae82921SPaul Mullowney 29909ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 29919ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 29929ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 29939ae82921SPaul Mullowney 29949ae82921SPaul Mullowney Notes: 29959ae82921SPaul Mullowney If nnz is given then nz is ignored 29969ae82921SPaul Mullowney 29979ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 29989ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 29999ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 30009ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 30019ae82921SPaul Mullowney 30029ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 30030298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 30049ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 30059ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 30069ae82921SPaul Mullowney 30079ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 30089ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 30099ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 30109ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 
30119ae82921SPaul Mullowney 30129ae82921SPaul Mullowney Level: intermediate 30139ae82921SPaul Mullowney 3014e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 30159ae82921SPaul Mullowney @*/ 30169ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 30179ae82921SPaul Mullowney { 30189ae82921SPaul Mullowney PetscErrorCode ierr; 30199ae82921SPaul Mullowney 30209ae82921SPaul Mullowney PetscFunctionBegin; 30219ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 30229ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 30239ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 30249ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 30259ae82921SPaul Mullowney PetscFunctionReturn(0); 30269ae82921SPaul Mullowney } 30279ae82921SPaul Mullowney 30286fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 30299ae82921SPaul Mullowney { 30309ae82921SPaul Mullowney PetscErrorCode ierr; 30313fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 3032ab25e6cbSDominic Meiser 30339ae82921SPaul Mullowney PetscFunctionBegin; 30349ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 30353fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 30363fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3037470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 30389ae82921SPaul Mullowney } else { 3039470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3040aa372e3fSPaul Mullowney } 30413fa6b06aSMark Adams if (d_mat) { 30423fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 
30433fa6b06aSMark Adams cudaError_t err; 30443fa6b06aSMark Adams PetscSplitCSRDataStructure h_mat; 30453fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 30463fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 30473fa6b06aSMark Adams if (a->compressedrow.use) { 30483fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 30493fa6b06aSMark Adams } 30503fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 30513fa6b06aSMark Adams } 3052c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3053ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3054ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3055ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3056fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3057ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 30587e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 30597e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 30609ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 30619ae82921SPaul Mullowney PetscFunctionReturn(0); 30629ae82921SPaul Mullowney } 30639ae82921SPaul Mullowney 3064ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 306595639643SRichard Tran Mills static PetscErrorCode 
MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 30669ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 30679ff858a8SKarl Rupp { 30689ff858a8SKarl Rupp PetscErrorCode ierr; 30699ff858a8SKarl Rupp 30709ff858a8SKarl Rupp PetscFunctionBegin; 30719ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3072ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 30739ff858a8SKarl Rupp PetscFunctionReturn(0); 30749ff858a8SKarl Rupp } 30759ff858a8SKarl Rupp 3076039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 307795639643SRichard Tran Mills { 3078e6e9a74fSStefano Zampini PetscErrorCode ierr; 3079a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3080039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3081039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3082039c6fbaSStefano Zampini PetscScalar *ay; 3083039c6fbaSStefano Zampini const PetscScalar *ax; 3084039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3085e6e9a74fSStefano Zampini 308695639643SRichard Tran Mills PetscFunctionBegin; 3087a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3088a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3089039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3090a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3091a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3092a587d139SMark PetscFunctionReturn(0); 309395639643SRichard Tran Mills } 3094039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 3095a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3096a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3097e8d2b73aSMark Adams if (cy->format != MAT_CUSPARSE_CSR) 
SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3098e8d2b73aSMark Adams if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3099039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3100039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3101039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3102039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3103039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3104039c6fbaSStefano Zampini if (eq) { 3105039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3106039c6fbaSStefano Zampini } 3107039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3108039c6fbaSStefano Zampini } 3109d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3110d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3111039c6fbaSStefano Zampini 3112039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3113039c6fbaSStefano Zampini cusparseStatus_t stat; 3114039c6fbaSStefano Zampini PetscScalar b = 1.0; 3115039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3116039c6fbaSStefano Zampini size_t bufferSize; 3117039c6fbaSStefano Zampini void *buffer; 3118*ee7b52eaSHong Zhang cudaError_t cerr; 3119039c6fbaSStefano Zampini #endif 3120039c6fbaSStefano Zampini 3121039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3122039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3123039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3124039c6fbaSStefano 
Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3125039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3126039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3127039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3128039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3129039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3130039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3131039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3132039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3133039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3134039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3135039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3136039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3137039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3138039c6fbaSStefano Zampini #else 3139039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3140039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3141039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3142039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3143039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 
3144039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3145039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3146039c6fbaSStefano Zampini #endif 3147039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3148039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3149039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3150039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3151039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3152a587d139SMark cublasHandle_t cublasv2handle; 3153039c6fbaSStefano Zampini cublasStatus_t berr; 3154a587d139SMark PetscBLASInt one = 1, bnz = 1; 3155039c6fbaSStefano Zampini 3156039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3157039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3158a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3159a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3160a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3161039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3162a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3163a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3164039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3165039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3166a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3167039c6fbaSStefano Zampini } else { 3168a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3169d2be01edSStefano Zampini ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3170a587d139SMark } 317195639643SRichard Tran Mills PetscFunctionReturn(0); 
317295639643SRichard Tran Mills } 317395639643SRichard Tran Mills 317433c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 317533c9ba73SStefano Zampini { 317633c9ba73SStefano Zampini PetscErrorCode ierr; 317733c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 317833c9ba73SStefano Zampini PetscScalar *ay; 317933c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 318033c9ba73SStefano Zampini cublasStatus_t berr; 318133c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 318233c9ba73SStefano Zampini 318333c9ba73SStefano Zampini PetscFunctionBegin; 318433c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 318533c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 318633c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 318733c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 318833c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 318933c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 319033c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 319133c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 319233c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 319333c9ba73SStefano Zampini PetscFunctionReturn(0); 319433c9ba73SStefano Zampini } 319533c9ba73SStefano Zampini 31963fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 31973fa6b06aSMark Adams { 31983fa6b06aSMark Adams PetscErrorCode ierr; 31997e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3200a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 32017e8381f9SStefano Zampini 32023fa6b06aSMark Adams PetscFunctionBegin; 32033fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 32043fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 32057e8381f9SStefano Zampini if (spptr->mat) { 
32067e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 32077e8381f9SStefano Zampini if (matrix->values) { 32087e8381f9SStefano Zampini both = PETSC_TRUE; 32097e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 32107e8381f9SStefano Zampini } 32117e8381f9SStefano Zampini } 32127e8381f9SStefano Zampini if (spptr->matTranspose) { 32137e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 32147e8381f9SStefano Zampini if (matrix->values) { 32157e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 32167e8381f9SStefano Zampini } 32177e8381f9SStefano Zampini } 32183fa6b06aSMark Adams } 3219a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3220a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3221a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 32227e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3223a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 32243fa6b06aSMark Adams 32253fa6b06aSMark Adams PetscFunctionReturn(0); 32263fa6b06aSMark Adams } 32273fa6b06aSMark Adams 3228a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3229a587d139SMark { 3230a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3231a587d139SMark PetscErrorCode ierr; 3232a587d139SMark 3233a587d139SMark PetscFunctionBegin; 3234a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3235a587d139SMark if (flg) { 3236a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3237a587d139SMark 323833c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3239a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3240a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3241a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3242a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3243a587d139SMark A->ops->multtranspose = 
MatMultTranspose_SeqAIJ; 3244a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3245a587d139SMark A->ops->multhermitiantranspose = NULL; 3246a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3247fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3248c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3249a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3250a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3251a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3252a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3253a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3254fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3255a587d139SMark } else { 325633c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3257a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3258a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3259a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3260a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3261a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3262a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3263a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3264a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3265fcdce8c4SStefano Zampini A->ops->productsetfromoptions = 
MatProductSetFromOptions_SeqAIJCUSPARSE; 3266c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3267a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3268a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3269a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3270a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3271a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3272fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3273a587d139SMark } 3274a587d139SMark A->boundtocpu = flg; 3275a587d139SMark a->inode.use = flg; 3276a587d139SMark PetscFunctionReturn(0); 3277a587d139SMark } 3278a587d139SMark 327949735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 32809ae82921SPaul Mullowney { 32819ae82921SPaul Mullowney PetscErrorCode ierr; 3282aa372e3fSPaul Mullowney cusparseStatus_t stat; 328349735bf3SStefano Zampini Mat B; 32849ae82921SPaul Mullowney 32859ae82921SPaul Mullowney PetscFunctionBegin; 3286832b2c02SStefano Zampini ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 328749735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 328849735bf3SStefano Zampini ierr = 
MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 328949735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 329049735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 329149735bf3SStefano Zampini } 329249735bf3SStefano Zampini B = *newmat; 329349735bf3SStefano Zampini 329434136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 329534136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 329634136279SStefano Zampini 329749735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 32989ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3299e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3300e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3301e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3302a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 33031a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3304d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3305d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3306d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3307d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3308d8132acaSStefano Zampini #endif 33091a2c6b5cSJunchao Zhang B->spptr = spptr; 33109ae82921SPaul Mullowney } else { 3311e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3312e6e9a74fSStefano Zampini 3313e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3314e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3315a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3316e6e9a74fSStefano Zampini B->spptr = spptr; 33179ae82921SPaul Mullowney } 
3318e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 331949735bf3SStefano Zampini } 3320693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 33219ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 33221a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 33239ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 332495639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3325693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 33262205254eSKarl Rupp 3327e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 33289ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3329bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 33309ae82921SPaul Mullowney PetscFunctionReturn(0); 33319ae82921SPaul Mullowney } 33329ae82921SPaul Mullowney 333302fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 333402fe1965SBarry Smith { 333502fe1965SBarry Smith PetscErrorCode ierr; 333602fe1965SBarry Smith 333702fe1965SBarry Smith PetscFunctionBegin; 333802fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 33390ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 334002fe1965SBarry Smith PetscFunctionReturn(0); 334102fe1965SBarry Smith } 334202fe1965SBarry Smith 33433ca39a21SBarry Smith /*MC 3344e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3345e057df02SPaul Mullowney 3346e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 33472692e278SPaul Mullowney CSR, ELL, or Hybrid format. 
The ELL and HYB formats require CUDA 4.2 or later. 33482692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3349e057df02SPaul Mullowney 3350e057df02SPaul Mullowney Options Database Keys: 3351e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3352aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3353a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3354e057df02SPaul Mullowney 3355e057df02SPaul Mullowney Level: beginner 3356e057df02SPaul Mullowney 33578468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3358e057df02SPaul Mullowney M*/ 33597f756511SDominic Meiser 3360bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 33610f39cd5aSBarry Smith 33623ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 336342c9c57cSBarry Smith { 336442c9c57cSBarry Smith PetscErrorCode ierr; 336542c9c57cSBarry Smith 336642c9c57cSBarry Smith PetscFunctionBegin; 3367bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 33683ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 33693ca39a21SBarry Smith ierr = 
MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 33703ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 33713ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3372bddcd29dSMark Adams 337342c9c57cSBarry Smith PetscFunctionReturn(0); 337442c9c57cSBarry Smith } 337529b38603SBarry Smith 3376470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 33777f756511SDominic Meiser { 3378e6e9a74fSStefano Zampini PetscErrorCode ierr; 33797f756511SDominic Meiser cusparseStatus_t stat; 33807f756511SDominic Meiser 33817f756511SDominic Meiser PetscFunctionBegin; 33827f756511SDominic Meiser if (*cusparsestruct) { 3383e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3384e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 33857f756511SDominic Meiser delete (*cusparsestruct)->workVector; 338681902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 33877e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 33887e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3389a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 33907e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3391e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 33927f756511SDominic Meiser } 33937f756511SDominic Meiser PetscFunctionReturn(0); 33947f756511SDominic Meiser } 33957f756511SDominic Meiser 33967f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 
33977f756511SDominic Meiser { 33987f756511SDominic Meiser PetscFunctionBegin; 33997f756511SDominic Meiser if (*mat) { 34007f756511SDominic Meiser delete (*mat)->values; 34017f756511SDominic Meiser delete (*mat)->column_indices; 34027f756511SDominic Meiser delete (*mat)->row_offsets; 34037f756511SDominic Meiser delete *mat; 34047f756511SDominic Meiser *mat = 0; 34057f756511SDominic Meiser } 34067f756511SDominic Meiser PetscFunctionReturn(0); 34077f756511SDominic Meiser } 34087f756511SDominic Meiser 3409470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 34107f756511SDominic Meiser { 34117f756511SDominic Meiser cusparseStatus_t stat; 34127f756511SDominic Meiser PetscErrorCode ierr; 34137f756511SDominic Meiser 34147f756511SDominic Meiser PetscFunctionBegin; 34157f756511SDominic Meiser if (*trifactor) { 341657d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3417afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 34187f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 34191b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 34202cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3421afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 34221b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3423afb2bd1cSJunchao Zhang #endif 3424da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 34257f756511SDominic Meiser } 34267f756511SDominic Meiser PetscFunctionReturn(0); 34277f756511SDominic Meiser } 34287f756511SDominic Meiser 3429470880abSPatrick Sanan static 
PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 34307f756511SDominic Meiser { 34317f756511SDominic Meiser CsrMatrix *mat; 34327f756511SDominic Meiser cusparseStatus_t stat; 34337f756511SDominic Meiser cudaError_t err; 34347f756511SDominic Meiser 34357f756511SDominic Meiser PetscFunctionBegin; 34367f756511SDominic Meiser if (*matstruct) { 34377f756511SDominic Meiser if ((*matstruct)->mat) { 34387f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3439afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3440afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3441afb2bd1cSJunchao Zhang #else 34427f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 344357d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3444afb2bd1cSJunchao Zhang #endif 34457f756511SDominic Meiser } else { 34467f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 34477f756511SDominic Meiser CsrMatrix_Destroy(&mat); 34487f756511SDominic Meiser } 34497f756511SDominic Meiser } 345057d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 34517f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3452afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 34537656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 34547656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3455afb2bd1cSJunchao Zhang 3456afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3457afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3458afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = 
cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3459afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3460afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 3461afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3462afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3463afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3464afb2bd1cSJunchao Zhang } 3465afb2bd1cSJunchao Zhang } 3466afb2bd1cSJunchao Zhang #endif 34677f756511SDominic Meiser delete *matstruct; 34687e8381f9SStefano Zampini *matstruct = NULL; 34697f756511SDominic Meiser } 34707f756511SDominic Meiser PetscFunctionReturn(0); 34717f756511SDominic Meiser } 34727f756511SDominic Meiser 3473e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 34747f756511SDominic Meiser { 3475e6e9a74fSStefano Zampini PetscErrorCode ierr; 3476e6e9a74fSStefano Zampini 34777f756511SDominic Meiser PetscFunctionBegin; 34787f756511SDominic Meiser if (*trifactors) { 3479e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3480e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3481e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3482e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 34837f756511SDominic Meiser delete (*trifactors)->rpermIndices; 34847f756511SDominic Meiser delete (*trifactors)->cpermIndices; 34857f756511SDominic Meiser delete (*trifactors)->workVector; 34867e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 34877e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 34887e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 
3489bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3490bddcd29dSMark Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3491e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3492ccdfe979SStefano Zampini } 3493ccdfe979SStefano Zampini PetscFunctionReturn(0); 3494ccdfe979SStefano Zampini } 3495ccdfe979SStefano Zampini 3496ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3497ccdfe979SStefano Zampini { 3498e6e9a74fSStefano Zampini PetscErrorCode ierr; 3499ccdfe979SStefano Zampini cusparseHandle_t handle; 3500ccdfe979SStefano Zampini cusparseStatus_t stat; 3501ccdfe979SStefano Zampini 3502ccdfe979SStefano Zampini PetscFunctionBegin; 3503ccdfe979SStefano Zampini if (*trifactors) { 3504e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 35057f756511SDominic Meiser if (handle = (*trifactors)->handle) { 350657d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 35077f756511SDominic Meiser } 3508e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 35097f756511SDominic Meiser } 35107f756511SDominic Meiser PetscFunctionReturn(0); 35117f756511SDominic Meiser } 35127e8381f9SStefano Zampini 35137e8381f9SStefano Zampini struct IJCompare 35147e8381f9SStefano Zampini { 35157e8381f9SStefano Zampini __host__ __device__ 35167e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 35177e8381f9SStefano Zampini { 35187e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 35197e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 35207e8381f9SStefano Zampini return false; 35217e8381f9SStefano Zampini } 35227e8381f9SStefano Zampini }; 35237e8381f9SStefano Zampini 
35247e8381f9SStefano Zampini struct IJEqual 35257e8381f9SStefano Zampini { 35267e8381f9SStefano Zampini __host__ __device__ 35277e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 35287e8381f9SStefano Zampini { 35297e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 35307e8381f9SStefano Zampini return true; 35317e8381f9SStefano Zampini } 35327e8381f9SStefano Zampini }; 35337e8381f9SStefano Zampini 35347e8381f9SStefano Zampini struct IJDiff 35357e8381f9SStefano Zampini { 35367e8381f9SStefano Zampini __host__ __device__ 35377e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 35387e8381f9SStefano Zampini { 35397e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 35407e8381f9SStefano Zampini } 35417e8381f9SStefano Zampini }; 35427e8381f9SStefano Zampini 35437e8381f9SStefano Zampini struct IJSum 35447e8381f9SStefano Zampini { 35457e8381f9SStefano Zampini __host__ __device__ 35467e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 35477e8381f9SStefano Zampini { 35487e8381f9SStefano Zampini return t1||t2; 35497e8381f9SStefano Zampini } 35507e8381f9SStefano Zampini }; 35517e8381f9SStefano Zampini 35527e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3553e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 35547e8381f9SStefano Zampini { 35557e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3556fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3557bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 355808391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 35597e8381f9SStefano Zampini CsrMatrix *matrix; 35607e8381f9SStefano Zampini PetscErrorCode ierr; 35617e8381f9SStefano Zampini PetscInt n; 35627e8381f9SStefano 
Zampini 35637e8381f9SStefano Zampini PetscFunctionBegin; 35647e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 35657e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 35667e8381f9SStefano Zampini if (!cusp->cooPerm) { 35677e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 35687e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 35697e8381f9SStefano Zampini PetscFunctionReturn(0); 35707e8381f9SStefano Zampini } 35717e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 35727e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3573e61fc153SStefano Zampini if (!v) { 3574e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3575e61fc153SStefano Zampini goto finalize; 35767e8381f9SStefano Zampini } 3577e61fc153SStefano Zampini n = cusp->cooPerm->size(); 357808391a17SStefano Zampini if (isCudaMem(v)) { 357908391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 358008391a17SStefano Zampini } else { 3581e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3582e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 358308391a17SStefano Zampini d_v = cooPerm_v->data(); 3584e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 358508391a17SStefano Zampini } 3586bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3587e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 35887e8381f9SStefano Zampini if (cusp->cooPerm_a) { 3589bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 359008391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3591e61fc153SStefano Zampini 
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3592e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3593e61fc153SStefano Zampini delete cooPerm_w; 35947e8381f9SStefano Zampini } else { 359508391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 35967e8381f9SStefano Zampini matrix->values->begin())); 359708391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 35987e8381f9SStefano Zampini matrix->values->end())); 35997e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 36007e8381f9SStefano Zampini } 36017e8381f9SStefano Zampini } else { 3602e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 360308391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3604e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 36057e8381f9SStefano Zampini } else { 360608391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 36077e8381f9SStefano Zampini matrix->values->begin())); 360808391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 36097e8381f9SStefano Zampini matrix->values->end())); 36107e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 36117e8381f9SStefano Zampini } 36127e8381f9SStefano 
Zampini } 3613bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3614e61fc153SStefano Zampini finalize: 3615e61fc153SStefano Zampini delete cooPerm_v; 36167e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3617e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3618fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3619fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3620fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3621fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3622fcdce8c4SStefano Zampini a->reallocs = 0; 3623fcdce8c4SStefano Zampini A->info.mallocs += 0; 3624fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3625fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3626fcdce8c4SStefano Zampini A->num_ass++; 36277e8381f9SStefano Zampini PetscFunctionReturn(0); 36287e8381f9SStefano Zampini } 36297e8381f9SStefano Zampini 3630a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3631a49f1ed0SStefano Zampini { 3632a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3633a49f1ed0SStefano Zampini PetscErrorCode ierr; 3634a49f1ed0SStefano Zampini 3635a49f1ed0SStefano Zampini PetscFunctionBegin; 3636a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3637a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3638a49f1ed0SStefano Zampini if (destroy) { 3639a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3640a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3641a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3642a49f1ed0SStefano Zampini } 36431a2c6b5cSJunchao Zhang A->transupdated = 
PETSC_FALSE; 3644a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3645a49f1ed0SStefano Zampini } 3646a49f1ed0SStefano Zampini 36477e8381f9SStefano Zampini #include <thrust/binary_search.h> 3648e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 36497e8381f9SStefano Zampini { 36507e8381f9SStefano Zampini PetscErrorCode ierr; 36517e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 36527e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 36537e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 36547e8381f9SStefano Zampini cudaError_t cerr; 36557e8381f9SStefano Zampini 36567e8381f9SStefano Zampini PetscFunctionBegin; 36577e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 36587e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 36597e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 36607e8381f9SStefano Zampini if (n != cooPerm_n) { 36617e8381f9SStefano Zampini delete cusp->cooPerm; 36627e8381f9SStefano Zampini delete cusp->cooPerm_a; 36637e8381f9SStefano Zampini cusp->cooPerm = NULL; 36647e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 36657e8381f9SStefano Zampini } 36667e8381f9SStefano Zampini if (n) { 36677e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 36687e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 36697e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 36707e8381f9SStefano Zampini 36717e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 36727e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 36737e8381f9SStefano Zampini 36747e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 36757e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 36767e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 36777e8381f9SStefano Zampini auto fkey = 
thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 36787e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 36797e8381f9SStefano Zampini 368008391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 36817e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 36827e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 36837e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 36847e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 36857e8381f9SStefano Zampini 36867e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 36877e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 36887e8381f9SStefano Zampini delete cusp->cooPerm_a; 36897e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 36907e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 36917e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 36927e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 36937e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 36947e8381f9SStefano Zampini w[0] = 0; 36957e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 36967e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 36977e8381f9SStefano Zampini } 36987e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 36997e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 37007e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 37017e8381f9SStefano Zampini ii.begin()); 370208391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 
37037e8381f9SStefano Zampini 37047e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 37057e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 37067e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 37077e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 37087e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 37097e8381f9SStefano Zampini a->i[0] = 0; 37107e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 37117e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3712fcdce8c4SStefano Zampini a->rmax = 0; 37137e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 37147e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 37157e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 37167e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 37177e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 37187e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 37197e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 37207e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 37217e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3722fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 37237e8381f9SStefano Zampini } 3724fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 37257e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 37267e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3727fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 37287e8381f9SStefano Zampini } else { 37297e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 37307e8381f9SStefano Zampini } 3731e61fc153SStefano Zampini ierr = 
MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 37327e8381f9SStefano Zampini 37337e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3734e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 3735e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 37367e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 37377e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 37387e8381f9SStefano Zampini A->nonzerostate++; 37397e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3740a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 37417e8381f9SStefano Zampini 37427e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 37437e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 37447e8381f9SStefano Zampini PetscFunctionReturn(0); 37457e8381f9SStefano Zampini } 3746ed502f03SStefano Zampini 3747ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3748ed502f03SStefano Zampini { 3749ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3750ed502f03SStefano Zampini CsrMatrix *csr; 3751ed502f03SStefano Zampini PetscErrorCode ierr; 3752ed502f03SStefano Zampini 3753ed502f03SStefano Zampini PetscFunctionBegin; 3754ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3755ed502f03SStefano Zampini PetscValidPointer(a,2); 3756ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3757ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3758ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 375933c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 
3760ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3761ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3762ed502f03SStefano Zampini *a = csr->values->data().get(); 3763ed502f03SStefano Zampini PetscFunctionReturn(0); 3764ed502f03SStefano Zampini } 3765ed502f03SStefano Zampini 3766ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3767ed502f03SStefano Zampini { 3768ed502f03SStefano Zampini PetscFunctionBegin; 3769ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3770ed502f03SStefano Zampini PetscValidPointer(a,2); 3771ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3772ed502f03SStefano Zampini *a = NULL; 3773ed502f03SStefano Zampini PetscFunctionReturn(0); 3774ed502f03SStefano Zampini } 3775ed502f03SStefano Zampini 3776039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3777039c6fbaSStefano Zampini { 3778039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3779039c6fbaSStefano Zampini CsrMatrix *csr; 3780039c6fbaSStefano Zampini PetscErrorCode ierr; 3781039c6fbaSStefano Zampini 3782039c6fbaSStefano Zampini PetscFunctionBegin; 3783039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3784039c6fbaSStefano Zampini PetscValidPointer(a,2); 3785039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3786039c6fbaSStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3787039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 378833c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3789039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3790039c6fbaSStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 
3791039c6fbaSStefano Zampini *a = csr->values->data().get(); 3792039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3793a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3794039c6fbaSStefano Zampini PetscFunctionReturn(0); 3795039c6fbaSStefano Zampini } 3796039c6fbaSStefano Zampini 3797039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3798039c6fbaSStefano Zampini { 3799039c6fbaSStefano Zampini PetscErrorCode ierr; 3800039c6fbaSStefano Zampini 3801039c6fbaSStefano Zampini PetscFunctionBegin; 3802039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3803039c6fbaSStefano Zampini PetscValidPointer(a,2); 3804039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3805039c6fbaSStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3806039c6fbaSStefano Zampini *a = NULL; 3807039c6fbaSStefano Zampini PetscFunctionReturn(0); 3808039c6fbaSStefano Zampini } 3809039c6fbaSStefano Zampini 3810ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3811ed502f03SStefano Zampini { 3812ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3813ed502f03SStefano Zampini CsrMatrix *csr; 3814a49f1ed0SStefano Zampini PetscErrorCode ierr; 3815ed502f03SStefano Zampini 3816ed502f03SStefano Zampini PetscFunctionBegin; 3817ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3818ed502f03SStefano Zampini PetscValidPointer(a,2); 3819ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3820ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 382133c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3822ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3823ed502f03SStefano 
Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3824ed502f03SStefano Zampini *a = csr->values->data().get(); 3825039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3826a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3827ed502f03SStefano Zampini PetscFunctionReturn(0); 3828ed502f03SStefano Zampini } 3829ed502f03SStefano Zampini 3830ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3831ed502f03SStefano Zampini { 3832ed502f03SStefano Zampini PetscErrorCode ierr; 3833ed502f03SStefano Zampini 3834ed502f03SStefano Zampini PetscFunctionBegin; 3835ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3836ed502f03SStefano Zampini PetscValidPointer(a,2); 3837ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3838ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3839ed502f03SStefano Zampini *a = NULL; 3840ed502f03SStefano Zampini PetscFunctionReturn(0); 3841ed502f03SStefano Zampini } 3842ed502f03SStefano Zampini 3843ed502f03SStefano Zampini struct IJCompare4 3844ed502f03SStefano Zampini { 3845ed502f03SStefano Zampini __host__ __device__ 38462ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3847ed502f03SStefano Zampini { 3848ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 3849ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3850ed502f03SStefano Zampini return false; 3851ed502f03SStefano Zampini } 3852ed502f03SStefano Zampini }; 3853ed502f03SStefano Zampini 38548909a122SStefano Zampini struct Shift 38558909a122SStefano Zampini { 3856ed502f03SStefano Zampini int _shift; 3857ed502f03SStefano Zampini 3858ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 3859ed502f03SStefano Zampini __host__ 
__device__ 3860ed502f03SStefano Zampini inline int operator() (const int &c) 3861ed502f03SStefano Zampini { 3862ed502f03SStefano Zampini return c + _shift; 3863ed502f03SStefano Zampini } 3864ed502f03SStefano Zampini }; 3865ed502f03SStefano Zampini 3866ed502f03SStefano Zampini /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3867ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3868ed502f03SStefano Zampini { 3869ed502f03SStefano Zampini PetscErrorCode ierr; 3870ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3871ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3872ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3873ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3874ed502f03SStefano Zampini PetscInt Annz,Bnnz; 3875ed502f03SStefano Zampini cusparseStatus_t stat; 3876ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 3877ed502f03SStefano Zampini cudaError_t cerr; 3878ed502f03SStefano Zampini 3879ed502f03SStefano Zampini PetscFunctionBegin; 3880ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3881ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3882ed502f03SStefano Zampini PetscValidPointer(C,4); 3883ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3884ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3885ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3886ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3887ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3888ed502f03SStefano 
Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3889ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 3890ed502f03SStefano Zampini m = A->rmap->n; 3891ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 3892ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3893ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3894ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3895ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 3896ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3897ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3898ed502f03SStefano Zampini Ccsr = new CsrMatrix; 3899ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 3900ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 3901ed502f03SStefano Zampini c->compressedrow.nrows = 0; 3902ed502f03SStefano Zampini c->compressedrow.i = NULL; 3903ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 3904ed502f03SStefano Zampini Ccusp->workVector = NULL; 3905ed502f03SStefano Zampini Ccusp->nrows = m; 3906ed502f03SStefano Zampini Ccusp->mat = Cmat; 3907ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 3908ed502f03SStefano Zampini Ccsr->num_rows = m; 3909ed502f03SStefano Zampini Ccsr->num_cols = n; 3910ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3911ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3912ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3913ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3914ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3915ed502f03SStefano Zampini cerr = 
cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3916ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3917ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3918ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3919ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3920ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 39211a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 39221a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 3923ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3924ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3925ed502f03SStefano Zampini 3926ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 3927ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3928ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 3929ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 3930ed502f03SStefano Zampini c->nz = Annz + Bnnz; 3931ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3932ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3933ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3934ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 3935ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3936ed502f03SStefano Zampini if (c->nz) { 39372ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 39382ed87e7eSStefano Zampini auto Bcoo = new 
THRUSTINTARRAY32(Bnnz); 39392ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 39402ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 39412ed87e7eSStefano Zampini 3942ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 3943ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 3944ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3945ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3946ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3947ed502f03SStefano Zampini } 39482ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 39492ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 3950ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 3951ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 3952ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3953ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3954ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3955ed502f03SStefano Zampini } 39562ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 39572ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 3958ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 39592ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 39602ed87e7eSStefano Zampini Aroff->data().get(), 39612ed87e7eSStefano Zampini Annz, 39622ed87e7eSStefano Zampini m, 39632ed87e7eSStefano Zampini Acoo->data().get(), 39642ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3965ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 39662ed87e7eSStefano Zampini Broff->data().get(), 3967ed502f03SStefano Zampini Bnnz, 3968ed502f03SStefano Zampini m, 39692ed87e7eSStefano Zampini Bcoo->data().get(), 3970ed502f03SStefano Zampini 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 39712ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 39722ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 39732ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 39748909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 3975ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3976ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 39778909a122SStefano Zampini #else 39788909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 39798909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 39808909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 39818909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 39828909a122SStefano Zampini #endif 39832ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 39842ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 39852ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 39862ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 39872ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 39882ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 3989ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 3990ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 
3991ed502f03SStefano Zampini thrust::advance(p2,Annz); 39922ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 39938909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 39948909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 39958909a122SStefano Zampini #endif 39962ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 39972ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 39982ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 39992ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 40002ed87e7eSStefano Zampini #else 40012ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 40022ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 40032ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 40042ed87e7eSStefano Zampini #endif 4005ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 40062ed87e7eSStefano Zampini Ccoo->data().get(), 4007ed502f03SStefano Zampini c->nz, 4008ed502f03SStefano Zampini m, 4009ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4010ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4011ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 40122ed87e7eSStefano Zampini delete wPerm; 40132ed87e7eSStefano Zampini delete Acoo; 40142ed87e7eSStefano Zampini delete Bcoo; 40152ed87e7eSStefano Zampini delete Ccoo; 4016ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4017ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4018ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4019ed502f03SStefano 
Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4020ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4021ed502f03SStefano Zampini #endif 40221a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4023ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4024ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4025ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4026ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4027ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4028ed502f03SStefano Zampini 40291a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 40301a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4031a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4032ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4033ed502f03SStefano Zampini CmatT->mat = CcsrT; 4034ed502f03SStefano Zampini CcsrT->num_rows = n; 4035ed502f03SStefano Zampini CcsrT->num_cols = m; 4036ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4037ed502f03SStefano Zampini 4038ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4039ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4040ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4041ed502f03SStefano Zampini 4042ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4043ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4044ed502f03SStefano Zampini if (AT) { 4045ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4046ed502f03SStefano Zampini thrust::advance(rT,-1); 4047ed502f03SStefano Zampini } 4048ed502f03SStefano 
Zampini if (BT) { 4049ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4050ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4051ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4052ed502f03SStefano Zampini } 4053ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4054ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4055ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4056ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4057ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4058ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4059ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4060ed502f03SStefano Zampini 4061ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4062ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4063ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4064ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4065ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4066ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4067ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4068ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 
4069ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4070ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4071ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4072ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4073ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4074ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4075ed502f03SStefano Zampini #endif 4076ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4077ed502f03SStefano Zampini } 4078ed502f03SStefano Zampini } 4079ed502f03SStefano Zampini 4080ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4081ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4082ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4083ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4084ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4085ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4086ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4087ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4088ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4089ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4090ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4091ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4092ed502f03SStefano Zampini } else { 4093ed502f03SStefano Zampini cerr = 
cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4094ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4095ed502f03SStefano Zampini } 4096ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4097ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4098ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4099ed502f03SStefano Zampini c->maxnz = c->nz; 4100ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4101ed502f03SStefano Zampini c->rmax = 0; 4102ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4103ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4104ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4105ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4106ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4107ed502f03SStefano Zampini } 4108ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4109ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4110ed502f03SStefano Zampini (*C)->nonzerostate++; 4111ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4112ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4113ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4114ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4115ed502f03SStefano Zampini } else { 4116ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4117ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4118ed502f03SStefano Zampini if (c->nz) { 4119ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 
4120ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4121ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4122ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4123ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4124ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4125ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4126ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4127ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4128ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4129ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4130ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4131ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4132ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4133ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4134ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != 
%D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4135ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4136ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4137ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4138ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4139ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4140ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4141ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4142ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4143ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4144ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4145ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4146ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4147ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4148a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 41491a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4150ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4151ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4152ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4153ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4154ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4155ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4156ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4157ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 41581a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4159ed502f03SStefano Zampini } 4160ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4161ed502f03SStefano Zampini } 4162ed502f03SStefano Zampini } 4163ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4164ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4165ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4166ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4167ed502f03SStefano Zampini PetscFunctionReturn(0); 4168ed502f03SStefano Zampini } 4169c215019aSStefano Zampini 4170c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4171c215019aSStefano Zampini { 4172c215019aSStefano Zampini PetscErrorCode ierr; 4173c215019aSStefano Zampini bool dmem; 4174c215019aSStefano Zampini const PetscScalar *av; 4175c215019aSStefano Zampini cudaError_t cerr; 4176c215019aSStefano Zampini 4177c215019aSStefano Zampini PetscFunctionBegin; 4178c215019aSStefano Zampini dmem = isCudaMem(v); 4179c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4180c215019aSStefano Zampini if (n && idx) { 4181c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4182c215019aSStefano Zampini widx.assign(idx,idx+n); 4183c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4184c215019aSStefano Zampini 4185c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4186c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 
4187c215019aSStefano Zampini if (dmem) { 4188c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4189c215019aSStefano Zampini } else { 4190c215019aSStefano Zampini w = new THRUSTARRAY(n); 4191c215019aSStefano Zampini dv = w->data(); 4192c215019aSStefano Zampini } 4193c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4194c215019aSStefano Zampini 4195c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4196c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4197c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4198c215019aSStefano Zampini if (w) { 4199c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4200c215019aSStefano Zampini } 4201c215019aSStefano Zampini delete w; 4202c215019aSStefano Zampini } else { 4203c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4204c215019aSStefano Zampini } 4205c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4206c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4207c215019aSStefano Zampini PetscFunctionReturn(0); 4208c215019aSStefano Zampini } 4209