19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92c215019aSStefano Zampini 93b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 94b06137fdSPaul Mullowney { 95b06137fdSPaul Mullowney cusparseStatus_t stat; 96b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 97b06137fdSPaul Mullowney 98b06137fdSPaul Mullowney PetscFunctionBegin; 99d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 100b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10157d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 
102b06137fdSPaul Mullowney PetscFunctionReturn(0); 103b06137fdSPaul Mullowney } 104b06137fdSPaul Mullowney 105b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 106b06137fdSPaul Mullowney { 107b06137fdSPaul Mullowney cusparseStatus_t stat; 108b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 109b06137fdSPaul Mullowney 110b06137fdSPaul Mullowney PetscFunctionBegin; 111d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1126b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11316a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11457d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11516a2e217SAlejandro Lamas Daviña } 116b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1176b1cf21dSAlejandro Lamas Daviña } 11857d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 119b06137fdSPaul Mullowney PetscFunctionReturn(0); 120b06137fdSPaul Mullowney } 121b06137fdSPaul Mullowney 122b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 123b06137fdSPaul Mullowney { 124b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1257e8381f9SStefano Zampini PetscBool flg; 1267e8381f9SStefano Zampini PetscErrorCode ierr; 127ccdfe979SStefano Zampini 128b06137fdSPaul Mullowney PetscFunctionBegin; 1297e8381f9SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1307e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 131ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 132b06137fdSPaul Mullowney PetscFunctionReturn(0); 133b06137fdSPaul Mullowney } 134b06137fdSPaul Mullowney 135ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat 
A,MatSolverType *type) 1369ae82921SPaul Mullowney { 1379ae82921SPaul Mullowney PetscFunctionBegin; 1389ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1399ae82921SPaul Mullowney PetscFunctionReturn(0); 1409ae82921SPaul Mullowney } 1419ae82921SPaul Mullowney 142c708e6cdSJed Brown /*MC 143087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 144087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 145087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 146087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 147087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 148087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
149c708e6cdSJed Brown 1509ae82921SPaul Mullowney Level: beginner 151c708e6cdSJed Brown 1523ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 153c708e6cdSJed Brown M*/ 1549ae82921SPaul Mullowney 15542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1569ae82921SPaul Mullowney { 1579ae82921SPaul Mullowney PetscErrorCode ierr; 158bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1599ae82921SPaul Mullowney 1609ae82921SPaul Mullowney PetscFunctionBegin; 161bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 162bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1632c7c0729SBarry Smith (*B)->factortype = ftype; 1649ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1652205254eSKarl Rupp 1669c1083e7SRichard Tran Mills if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 167087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16833d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1699c1083e7SRichard Tran Mills if (!A->boundtocpu) { 1709ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1719ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1729c1083e7SRichard Tran Mills } else { 1739c1083e7SRichard Tran Mills (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1749c1083e7SRichard Tran Mills (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1759c1083e7SRichard Tran Mills } 1764ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1774ac6704cSBarry Smith ierr = 
PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 1784ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 179087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1809c1083e7SRichard Tran Mills if (!A->boundtocpu) { 181087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 182087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1839c1083e7SRichard Tran Mills } else { 1849c1083e7SRichard Tran Mills (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1859c1083e7SRichard Tran Mills (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1869c1083e7SRichard Tran Mills } 1874ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 1884ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 1899ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 190bc3f50f2SPaul Mullowney 191fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1924ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1933ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1949ae82921SPaul Mullowney PetscFunctionReturn(0); 1959ae82921SPaul Mullowney } 1969ae82921SPaul Mullowney 197bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 198ca45077fSPaul Mullowney { 199aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2006e111a19SKarl Rupp 
201ca45077fSPaul Mullowney PetscFunctionBegin; 202ca45077fSPaul Mullowney switch (op) { 203e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 204aa372e3fSPaul Mullowney cusparsestruct->format = format; 205ca45077fSPaul Mullowney break; 206e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 207aa372e3fSPaul Mullowney cusparsestruct->format = format; 208ca45077fSPaul Mullowney break; 209ca45077fSPaul Mullowney default: 21098921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 211ca45077fSPaul Mullowney } 212ca45077fSPaul Mullowney PetscFunctionReturn(0); 213ca45077fSPaul Mullowney } 2149ae82921SPaul Mullowney 215e057df02SPaul Mullowney /*@ 216e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 217e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 218aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 219e057df02SPaul Mullowney Not Collective 220e057df02SPaul Mullowney 221e057df02SPaul Mullowney Input Parameters: 2228468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 22336d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2242692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 225e057df02SPaul Mullowney 226e057df02SPaul Mullowney Output Parameter: 227e057df02SPaul Mullowney 228e057df02SPaul Mullowney Level: intermediate 229e057df02SPaul Mullowney 2308468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 231e057df02SPaul Mullowney @*/ 232e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 233e057df02SPaul Mullowney { 234e057df02SPaul Mullowney PetscErrorCode ierr; 2356e111a19SKarl Rupp 236e057df02SPaul Mullowney PetscFunctionBegin; 237e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 238e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 239e057df02SPaul Mullowney PetscFunctionReturn(0); 240e057df02SPaul Mullowney } 241e057df02SPaul Mullowney 242365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 243365b711fSMark Adams { 244365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 245365b711fSMark Adams 246365b711fSMark Adams PetscFunctionBegin; 247365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 248365b711fSMark Adams PetscFunctionReturn(0); 249365b711fSMark Adams } 250365b711fSMark Adams 251365b711fSMark Adams /*@ 252365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 253365b711fSMark Adams 254365b711fSMark Adams Input Parameters: 255365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 256365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 257365b711fSMark Adams 258365b711fSMark Adams Output Parameter: 259365b711fSMark Adams 260365b711fSMark Adams Notes: 261365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 262365b711fSMark Adams and moves the factors to the GPU for the solve. 
We have observed better performance keeping the data on the CPU and computing the solve there. 263365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 264365b711fSMark Adams 265365b711fSMark Adams Level: intermediate 266365b711fSMark Adams 267365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 268365b711fSMark Adams @*/ 269365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 270365b711fSMark Adams { 271365b711fSMark Adams PetscErrorCode ierr; 272365b711fSMark Adams 273365b711fSMark Adams PetscFunctionBegin; 274365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 275365b711fSMark Adams ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr); 276365b711fSMark Adams PetscFunctionReturn(0); 277365b711fSMark Adams } 278365b711fSMark Adams 2791a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 280e6e9a74fSStefano Zampini { 281e6e9a74fSStefano Zampini PetscErrorCode ierr; 282e6e9a74fSStefano Zampini 283e6e9a74fSStefano Zampini PetscFunctionBegin; 2841a2c6b5cSJunchao Zhang switch (op) { 2851a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2861a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2871a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2881a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2891a2c6b5cSJunchao Zhang break; 2901a2c6b5cSJunchao Zhang default: 2911a2c6b5cSJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2921a2c6b5cSJunchao Zhang break; 293e6e9a74fSStefano Zampini } 294e6e9a74fSStefano Zampini PetscFunctionReturn(0); 295e6e9a74fSStefano Zampini } 296e6e9a74fSStefano Zampini 297bddcd29dSMark Adams static PetscErrorCode 
MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 298bddcd29dSMark Adams 299bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 300bddcd29dSMark Adams { 301bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 302bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 303bddcd29dSMark Adams PetscBool row_identity,col_identity; 304365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 305bddcd29dSMark Adams PetscErrorCode ierr; 306bddcd29dSMark Adams 307bddcd29dSMark Adams PetscFunctionBegin; 308bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 309bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 310bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 311bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 312bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 313bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 314bddcd29dSMark Adams if (row_identity && col_identity) { 315365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 316bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 317bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 318365b711fSMark Adams } 319bddcd29dSMark Adams B->ops->matsolve = NULL; 320bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 321bddcd29dSMark Adams } else { 322365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 323bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 324bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 325365b711fSMark Adams } 326bddcd29dSMark Adams B->ops->matsolve = NULL; 327bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 328bddcd29dSMark Adams } 329bddcd29dSMark Adams 330bddcd29dSMark Adams /* get the triangular factors */ 331365b711fSMark Adams if 
(!cusparsestruct->use_cpu_solve) { 332bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 333365b711fSMark Adams } 334bddcd29dSMark Adams PetscFunctionReturn(0); 335bddcd29dSMark Adams } 336bddcd29dSMark Adams 3374416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 3389ae82921SPaul Mullowney { 3399ae82921SPaul Mullowney PetscErrorCode ierr; 340e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 3419ae82921SPaul Mullowney PetscBool flg; 342a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3436e111a19SKarl Rupp 3449ae82921SPaul Mullowney PetscFunctionBegin; 345e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 3469ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 347e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 348a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 349afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 350afb2bd1cSJunchao Zhang 3514c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 352a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 353afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 354365b711fSMark Adams ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr); 355365b711fSMark Adams if (flg) {ierr = 
MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);} 356afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 357afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 358afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 359afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 360a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 361a435da06SStefano Zampini if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 362a435da06SStefano Zampini #else 363afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 364a435da06SStefano Zampini #endif 365afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 366afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 367afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 368afb2bd1cSJunchao Zhang 369afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 370afb2bd1cSJunchao Zhang 
"cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 371afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 372afb2bd1cSJunchao Zhang #endif 3734c87dfd4SPaul Mullowney } 3740af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3759ae82921SPaul Mullowney PetscFunctionReturn(0); 3769ae82921SPaul Mullowney } 3779ae82921SPaul Mullowney 3786fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3799ae82921SPaul Mullowney { 380da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3819ae82921SPaul Mullowney PetscErrorCode ierr; 3829ae82921SPaul Mullowney 3839ae82921SPaul Mullowney PetscFunctionBegin; 384da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3859ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3869ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3879ae82921SPaul Mullowney PetscFunctionReturn(0); 3889ae82921SPaul Mullowney } 3899ae82921SPaul Mullowney 3906fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3919ae82921SPaul Mullowney { 392da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3939ae82921SPaul Mullowney PetscErrorCode ierr; 3949ae82921SPaul Mullowney 3959ae82921SPaul Mullowney PetscFunctionBegin; 396da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3979ae82921SPaul Mullowney ierr = 
MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3989ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3999ae82921SPaul Mullowney PetscFunctionReturn(0); 4009ae82921SPaul Mullowney } 4019ae82921SPaul Mullowney 402087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 403087f3262SPaul Mullowney { 404da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 405087f3262SPaul Mullowney PetscErrorCode ierr; 406087f3262SPaul Mullowney 407087f3262SPaul Mullowney PetscFunctionBegin; 408da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 409087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 410087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 411087f3262SPaul Mullowney PetscFunctionReturn(0); 412087f3262SPaul Mullowney } 413087f3262SPaul Mullowney 414087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 415087f3262SPaul Mullowney { 416da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 417087f3262SPaul Mullowney PetscErrorCode ierr; 418087f3262SPaul Mullowney 419087f3262SPaul Mullowney PetscFunctionBegin; 420da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 421087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 422087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 423087f3262SPaul Mullowney PetscFunctionReturn(0); 424087f3262SPaul Mullowney } 425087f3262SPaul Mullowney 426087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 4279ae82921SPaul 
Mullowney { 4289ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4299ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4309ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 431aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 4329ae82921SPaul Mullowney cusparseStatus_t stat; 4339ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 4349ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4359ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 4369ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 437b175d8bbSPaul Mullowney PetscErrorCode ierr; 43857d48284SJunchao Zhang cudaError_t cerr; 4399ae82921SPaul Mullowney 4409ae82921SPaul Mullowney PetscFunctionBegin; 441cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 442c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4439ae82921SPaul Mullowney try { 4449ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 4459ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 446da79fbbcSStefano Zampini if (!loTriFactor) { 4472cbc15d9SMark PetscScalar *AALo; 4482cbc15d9SMark 4492cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4509ae82921SPaul Mullowney 4519ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 45257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 45357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 4549ae82921SPaul Mullowney 4559ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 4569ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 4579ae82921SPaul Mullowney AiLo[n] = nzLower; 4589ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 4599ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4609ae82921SPaul Mullowney v = aa; 4619ae82921SPaul Mullowney vi = aj; 4629ae82921SPaul Mullowney offset = 1; 4639ae82921SPaul Mullowney rowOffset= 1; 4649ae82921SPaul Mullowney for (i=1; i<n; i++) { 4659ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 466e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4679ae82921SPaul Mullowney AiLo[i] = rowOffset; 4689ae82921SPaul Mullowney rowOffset += nz+1; 4699ae82921SPaul Mullowney 470580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 471580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 4729ae82921SPaul Mullowney 4739ae82921SPaul Mullowney offset += nz; 4749ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4759ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4769ae82921SPaul Mullowney offset += 1; 4779ae82921SPaul Mullowney 4789ae82921SPaul Mullowney v += nz; 4799ae82921SPaul Mullowney vi += nz; 4809ae82921SPaul Mullowney } 4812205254eSKarl Rupp 482aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 483da79fbbcSStefano Zampini ierr = 
PetscNew(&loTriFactor);CHKERRQ(ierr); 484da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 485aa372e3fSPaul Mullowney /* Create the matrix description */ 48657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 48757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4881b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 489afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 490afb2bd1cSJunchao Zhang #else 49157d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 492afb2bd1cSJunchao Zhang #endif 49357d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 49457d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 495aa372e3fSPaul Mullowney 496aa372e3fSPaul Mullowney /* set the operation */ 497aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 498aa372e3fSPaul Mullowney 499aa372e3fSPaul Mullowney /* set the matrix */ 500aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 501aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 502aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 503aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 504aa372e3fSPaul Mullowney 505aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 506aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 507aa372e3fSPaul Mullowney 508aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 509aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 510aa372e3fSPaul Mullowney 511aa372e3fSPaul 
Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 512aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 513aa372e3fSPaul Mullowney 514afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 515da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 516afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 518afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 519afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 520afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 521afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 522afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 523afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 524afb2bd1cSJunchao Zhang #endif 525afb2bd1cSJunchao Zhang 526aa372e3fSPaul Mullowney /* perform the solve analysis */ 527aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 528aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 529aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 530d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 5311b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 532d49cd2b7SBarry Smith loTriFactor->solveInfo, 533d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 534d49cd2b7SBarry Smith #else 535d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 
536afb2bd1cSJunchao Zhang #endif 537da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 538da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 539aa372e3fSPaul Mullowney 540da79fbbcSStefano Zampini /* assign the pointer */ 541aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 5422cbc15d9SMark loTriFactor->AA_h = AALo; 54357d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 54457d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 5454863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 546da79fbbcSStefano Zampini } else { /* update values only */ 5472cbc15d9SMark if (!loTriFactor->AA_h) { 5482cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 5492cbc15d9SMark } 550da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 5512cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 552da79fbbcSStefano Zampini v = aa; 553da79fbbcSStefano Zampini vi = aj; 554da79fbbcSStefano Zampini offset = 1; 555da79fbbcSStefano Zampini for (i=1; i<n; i++) { 556da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 5572cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 558da79fbbcSStefano Zampini offset += nz; 5592cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 560da79fbbcSStefano Zampini offset += 1; 561da79fbbcSStefano Zampini v += nz; 562da79fbbcSStefano Zampini } 5632cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 564da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 565da79fbbcSStefano Zampini } 5669ae82921SPaul Mullowney } catch(char *ex) { 56798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5689ae82921SPaul Mullowney } 5699ae82921SPaul Mullowney } 5709ae82921SPaul Mullowney 
PetscFunctionReturn(0); 5719ae82921SPaul Mullowney } 5729ae82921SPaul Mullowney 573087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5749ae82921SPaul Mullowney { 5759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5769ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5779ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 578aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5799ae82921SPaul Mullowney cusparseStatus_t stat; 5809ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5819ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5829ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5839ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5849ae82921SPaul Mullowney PetscErrorCode ierr; 58557d48284SJunchao Zhang cudaError_t cerr; 5869ae82921SPaul Mullowney 5879ae82921SPaul Mullowney PetscFunctionBegin; 588cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 589c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5909ae82921SPaul Mullowney try { 5919ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5929ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 593da79fbbcSStefano Zampini if (!upTriFactor) { 5942cbc15d9SMark PetscScalar *AAUp; 5952cbc15d9SMark 5962cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5972cbc15d9SMark 5989ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 59957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 60057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 6019ae82921SPaul Mullowney 6029ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 6039ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 6049ae82921SPaul Mullowney AiUp[n]=nzUpper; 6059ae82921SPaul Mullowney offset = nzUpper; 6069ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 6079ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 6089ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 6099ae82921SPaul Mullowney 610e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 6119ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 6129ae82921SPaul Mullowney 613e057df02SPaul Mullowney /* decrement the offset */ 6149ae82921SPaul Mullowney offset -= (nz+1); 6159ae82921SPaul Mullowney 616e057df02SPaul Mullowney /* first, set the diagonal elements */ 6179ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 61809f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 6199ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 6209ae82921SPaul Mullowney 621580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 622580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 6239ae82921SPaul Mullowney } 6242205254eSKarl Rupp 625aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 626da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 627da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 6282205254eSKarl Rupp 629aa372e3fSPaul Mullowney /* Create the matrix description */ 63057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 63157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 6321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 633afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 634afb2bd1cSJunchao Zhang #else 63557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 636afb2bd1cSJunchao Zhang #endif 63757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 63857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 639aa372e3fSPaul Mullowney 640aa372e3fSPaul Mullowney /* set the operation */ 641aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 642aa372e3fSPaul Mullowney 643aa372e3fSPaul Mullowney /* set the matrix */ 644aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 645aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 646aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 647aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 648aa372e3fSPaul Mullowney 649aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 650aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 651aa372e3fSPaul Mullowney 652aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 653aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 654aa372e3fSPaul Mullowney 655aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
656aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 657aa372e3fSPaul Mullowney 658afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 659da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 660afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 6611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 662afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 663afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 664afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 665afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 666afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 667afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 668afb2bd1cSJunchao Zhang #endif 669afb2bd1cSJunchao Zhang 670aa372e3fSPaul Mullowney /* perform the solve analysis */ 671aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 672aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 673aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 674d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 676d49cd2b7SBarry Smith upTriFactor->solveInfo, 677d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 678d49cd2b7SBarry Smith #else 679d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 680afb2bd1cSJunchao Zhang #endif 681da79fbbcSStefano Zampini cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 682da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 683aa372e3fSPaul Mullowney 684da79fbbcSStefano Zampini /* assign the pointer */ 685aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6862cbc15d9SMark upTriFactor->AA_h = AAUp; 68757d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 68857d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6894863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 690da79fbbcSStefano Zampini } else { 6912cbc15d9SMark if (!upTriFactor->AA_h) { 6922cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6932cbc15d9SMark } 694da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 695da79fbbcSStefano Zampini offset = nzUpper; 696da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 697da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 698da79fbbcSStefano Zampini 699da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 700da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 701da79fbbcSStefano Zampini 702da79fbbcSStefano Zampini /* decrement the offset */ 703da79fbbcSStefano Zampini offset -= (nz+1); 704da79fbbcSStefano Zampini 705da79fbbcSStefano Zampini /* first, set the diagonal elements */ 7062cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 7072cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 708da79fbbcSStefano Zampini } 7092cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 710da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 711da79fbbcSStefano Zampini } 7129ae82921SPaul Mullowney } catch(char *ex) { 71398921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 7149ae82921SPaul 
Mullowney } 7159ae82921SPaul Mullowney } 7169ae82921SPaul Mullowney PetscFunctionReturn(0); 7179ae82921SPaul Mullowney } 7189ae82921SPaul Mullowney 719087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 7209ae82921SPaul Mullowney { 7219ae82921SPaul Mullowney PetscErrorCode ierr; 7229ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 7239ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 7249ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 7259ae82921SPaul Mullowney PetscBool row_identity,col_identity; 7269ae82921SPaul Mullowney PetscInt n = A->rmap->n; 7279ae82921SPaul Mullowney 7289ae82921SPaul Mullowney PetscFunctionBegin; 729da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 730087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 731087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 7322205254eSKarl Rupp 733da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 734aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 7359ae82921SPaul Mullowney 736c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 737e057df02SPaul Mullowney /* lower triangular indices */ 7389ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 739da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 740da79fbbcSStefano Zampini const PetscInt *r; 741da79fbbcSStefano Zampini 742da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 743aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 744aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 7459ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 746da79fbbcSStefano Zampini ierr 
= PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 747da79fbbcSStefano Zampini } 7489ae82921SPaul Mullowney 749e057df02SPaul Mullowney /* upper triangular indices */ 7509ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 751da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 752da79fbbcSStefano Zampini const PetscInt *c; 753da79fbbcSStefano Zampini 754da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 755aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 756aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 7579ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 758da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 759da79fbbcSStefano Zampini } 7609ae82921SPaul Mullowney PetscFunctionReturn(0); 7619ae82921SPaul Mullowney } 7629ae82921SPaul Mullowney 763087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 764087f3262SPaul Mullowney { 765087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 766087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 767aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 768aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 769087f3262SPaul Mullowney cusparseStatus_t stat; 770087f3262SPaul Mullowney PetscErrorCode ierr; 77157d48284SJunchao Zhang cudaError_t cerr; 772087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 773087f3262SPaul Mullowney PetscScalar *AAUp; 774087f3262SPaul Mullowney PetscScalar *AALo; 775087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 776087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 777087f3262SPaul 
Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 778087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 779087f3262SPaul Mullowney 780087f3262SPaul Mullowney PetscFunctionBegin; 781cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 782c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 783087f3262SPaul Mullowney try { 784da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 785da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 786da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 787087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 78857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 78957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 790087f3262SPaul Mullowney 791087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 792087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 793087f3262SPaul Mullowney AiUp[n]=nzUpper; 794087f3262SPaul Mullowney offset = 0; 795087f3262SPaul Mullowney for (i=0; i<n; i++) { 796087f3262SPaul Mullowney /* set the pointers */ 797087f3262SPaul Mullowney v = aa + ai[i]; 798087f3262SPaul Mullowney vj = aj + ai[i]; 799087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 800087f3262SPaul Mullowney 801087f3262SPaul Mullowney /* first, set the diagonal elements */ 802087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 80309f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 804087f3262SPaul Mullowney AiUp[i] = offset; 80509f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 806087f3262SPaul Mullowney 807087f3262SPaul Mullowney offset+=1; 808087f3262SPaul Mullowney if (nz>0) { 809f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 
810580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 811087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 812087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 813087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 814087f3262SPaul Mullowney } 815087f3262SPaul Mullowney offset+=nz; 816087f3262SPaul Mullowney } 817087f3262SPaul Mullowney } 818087f3262SPaul Mullowney 819aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 820da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 821da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 822087f3262SPaul Mullowney 823aa372e3fSPaul Mullowney /* Create the matrix description */ 82457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 82557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8261b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 827afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 828afb2bd1cSJunchao Zhang #else 82957d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 830afb2bd1cSJunchao Zhang #endif 83157d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 83257d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 833087f3262SPaul Mullowney 834aa372e3fSPaul Mullowney /* set the matrix */ 835aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 836aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 837aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 838aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 839aa372e3fSPaul Mullowney 840aa372e3fSPaul Mullowney 
upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 841aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 842aa372e3fSPaul Mullowney 843aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 844aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 845aa372e3fSPaul Mullowney 846aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 847aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 848aa372e3fSPaul Mullowney 849afb2bd1cSJunchao Zhang /* set the operation */ 850afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 851afb2bd1cSJunchao Zhang 852afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 853da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 854afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 856afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 857afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 858afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 859afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 860afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 861afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 862afb2bd1cSJunchao Zhang #endif 863afb2bd1cSJunchao Zhang 864aa372e3fSPaul Mullowney /* perform the solve analysis */ 865aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 866aa372e3fSPaul Mullowney 
upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 867aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 868d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 8691b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 870d49cd2b7SBarry Smith upTriFactor->solveInfo, 871d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 872d49cd2b7SBarry Smith #else 873d49cd2b7SBarry Smith upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 874afb2bd1cSJunchao Zhang #endif 875da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 876da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 877aa372e3fSPaul Mullowney 878da79fbbcSStefano Zampini /* assign the pointer */ 879aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 880aa372e3fSPaul Mullowney 881aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 882da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 883da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 884aa372e3fSPaul Mullowney 885aa372e3fSPaul Mullowney /* Create the matrix description */ 88657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 88757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8881b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 889afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 890afb2bd1cSJunchao Zhang #else 89157d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 892afb2bd1cSJunchao Zhang #endif 89357d48284SJunchao Zhang stat = 
cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 89457d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 895aa372e3fSPaul Mullowney 896aa372e3fSPaul Mullowney /* set the operation */ 897aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 898aa372e3fSPaul Mullowney 899aa372e3fSPaul Mullowney /* set the matrix */ 900aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 901aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 902aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 903aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 904aa372e3fSPaul Mullowney 905aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 906aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 907aa372e3fSPaul Mullowney 908aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 909aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 910aa372e3fSPaul Mullowney 911aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 912aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 913aa372e3fSPaul Mullowney 914afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 915da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 916afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 9171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 918afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 919afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 920afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 921afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 922afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 923afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 924afb2bd1cSJunchao Zhang #endif 925afb2bd1cSJunchao Zhang 926aa372e3fSPaul Mullowney /* perform the solve analysis */ 927aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 928aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 929aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 930d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 9311b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 932d49cd2b7SBarry Smith loTriFactor->solveInfo, 933d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 934d49cd2b7SBarry Smith #else 935d49cd2b7SBarry Smith loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 936afb2bd1cSJunchao Zhang #endif 937da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 938da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 939aa372e3fSPaul Mullowney 940da79fbbcSStefano Zampini /* assign the pointer */ 941aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 942087f3262SPaul Mullowney 943da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 94457d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 94557d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 946da79fbbcSStefano Zampini } else { 947da79fbbcSStefano Zampini /* Fill the upper 
triangular matrix */ 948da79fbbcSStefano Zampini offset = 0; 949da79fbbcSStefano Zampini for (i=0; i<n; i++) { 950da79fbbcSStefano Zampini /* set the pointers */ 951da79fbbcSStefano Zampini v = aa + ai[i]; 952da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 953da79fbbcSStefano Zampini 954da79fbbcSStefano Zampini /* first, set the diagonal elements */ 955da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 956da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 957da79fbbcSStefano Zampini 958da79fbbcSStefano Zampini offset+=1; 959da79fbbcSStefano Zampini if (nz>0) { 960da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 961da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 962da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 963da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 964da79fbbcSStefano Zampini } 965da79fbbcSStefano Zampini offset+=nz; 966da79fbbcSStefano Zampini } 967da79fbbcSStefano Zampini } 968da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 969da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 970da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 971da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 972da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 973da79fbbcSStefano Zampini } 97457d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 97557d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 976087f3262SPaul Mullowney } catch(char *ex) { 97798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 978087f3262SPaul Mullowney } 979087f3262SPaul Mullowney } 980087f3262SPaul Mullowney PetscFunctionReturn(0); 981087f3262SPaul Mullowney } 982087f3262SPaul Mullowney 983087f3262SPaul Mullowney static PetscErrorCode 
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9849ae82921SPaul Mullowney { 9859ae82921SPaul Mullowney PetscErrorCode ierr; 986087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 987087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 988087f3262SPaul Mullowney IS ip = a->row; 989087f3262SPaul Mullowney PetscBool perm_identity; 990087f3262SPaul Mullowney PetscInt n = A->rmap->n; 991087f3262SPaul Mullowney 992087f3262SPaul Mullowney PetscFunctionBegin; 993da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 994087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 995da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 996aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 997aa372e3fSPaul Mullowney 998da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 999da79fbbcSStefano Zampini 1000087f3262SPaul Mullowney /* lower triangular indices */ 1001087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1002087f3262SPaul Mullowney if (!perm_identity) { 10034e4bbfaaSStefano Zampini IS iip; 1004da79fbbcSStefano Zampini const PetscInt *irip,*rip; 10054e4bbfaaSStefano Zampini 10064e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 10074e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 1008da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1009aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1010aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 1011aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 10124e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 10134e4bbfaaSStefano Zampini ierr = 
ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 10144e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 1015087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1016da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 1017da79fbbcSStefano Zampini } 1018087f3262SPaul Mullowney PetscFunctionReturn(0); 1019087f3262SPaul Mullowney } 1020087f3262SPaul Mullowney 1021087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 1022087f3262SPaul Mullowney { 1023087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1024087f3262SPaul Mullowney IS ip = b->row; 1025087f3262SPaul Mullowney PetscBool perm_identity; 1026b175d8bbSPaul Mullowney PetscErrorCode ierr; 1027087f3262SPaul Mullowney 1028087f3262SPaul Mullowney PetscFunctionBegin; 102957181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1030087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1031ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 1032087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 1033087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1034087f3262SPaul Mullowney if (perm_identity) { 1035087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1036087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 10374e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10384e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1039087f3262SPaul Mullowney } else { 1040087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1041087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 10424e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10434e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1044087f3262SPaul Mullowney } 1045087f3262SPaul Mullowney 1046087f3262SPaul Mullowney /* get the triangular factors */ 1047087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1048087f3262SPaul Mullowney PetscFunctionReturn(0); 1049087f3262SPaul Mullowney } 10509ae82921SPaul Mullowney 1051b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1052bda325fcSPaul Mullowney { 1053bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1054aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1055aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1056da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1057da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1058bda325fcSPaul Mullowney cusparseStatus_t stat; 1059aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1060aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1061aa372e3fSPaul Mullowney cusparseFillMode_t 
fillMode; 1062aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 10631b0a6780SStefano Zampini cudaError_t cerr; 1064da79fbbcSStefano Zampini PetscErrorCode ierr; 1065b175d8bbSPaul Mullowney 1066bda325fcSPaul Mullowney PetscFunctionBegin; 1067aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1068da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1069da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1070aa372e3fSPaul Mullowney 1071aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1072aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1073aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1074aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1075aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1076aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1077aa372e3fSPaul Mullowney 1078aa372e3fSPaul Mullowney /* Create the matrix description */ 107957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 108057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 108157d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 108257d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 108357d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1084aa372e3fSPaul Mullowney 1085aa372e3fSPaul Mullowney /* set the operation */ 1086aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1087aa372e3fSPaul Mullowney 1088aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 
1089aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1090afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1091afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1092aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1093afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1094afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1095afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1096aa372e3fSPaul Mullowney 1097aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1098afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1099afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1100afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1101afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1102afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1103afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1104afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1105afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1106afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1107afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 11081b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1109afb2bd1cSJunchao Zhang #endif 1110afb2bd1cSJunchao Zhang 1111da79fbbcSStefano Zampini ierr = 
PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1112aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1113aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1114aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1115aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1116aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1117aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1118afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1119afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1120afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1121d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1122afb2bd1cSJunchao Zhang #else 1123afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1124d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1125afb2bd1cSJunchao Zhang #endif 1126da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1127da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1128aa372e3fSPaul Mullowney 1129afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1130da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1131afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1133afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1134afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 
loTriFactorT->descr, 1135afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1136afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1137afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1138afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1139afb2bd1cSJunchao Zhang #endif 1140afb2bd1cSJunchao Zhang 1141afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1142aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1143afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1144afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1145d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 11461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1147d49cd2b7SBarry Smith loTriFactorT->solveInfo, 1148d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1149d49cd2b7SBarry Smith #else 1150d49cd2b7SBarry Smith loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1151afb2bd1cSJunchao Zhang #endif 1152da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1153da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1154aa372e3fSPaul Mullowney 1155da79fbbcSStefano Zampini /* assign the pointer */ 1156aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1157aa372e3fSPaul Mullowney 1158aa372e3fSPaul Mullowney /*********************************************/ 1159aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1160aa372e3fSPaul Mullowney /*********************************************/ 1161aa372e3fSPaul Mullowney 
1162aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1163da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1164da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1165aa372e3fSPaul Mullowney 1166aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1167aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1168aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1169aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1170aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1171aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1172aa372e3fSPaul Mullowney 1173aa372e3fSPaul Mullowney /* Create the matrix description */ 117457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 117557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 117657d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 117757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 117857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1179aa372e3fSPaul Mullowney 1180aa372e3fSPaul Mullowney /* set the operation */ 1181aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1182aa372e3fSPaul Mullowney 1183aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1184aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1185afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1186afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 
1187aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1188afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1189afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1190afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1191aa372e3fSPaul Mullowney 1192aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1193afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1194afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1195afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1196afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1197afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1198afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1199afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1200afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1201afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1202afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1203afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1204afb2bd1cSJunchao Zhang #endif 1205afb2bd1cSJunchao Zhang 1206da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1207aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1208aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1209aa372e3fSPaul Mullowney 
upTriFactor->csrMat->values->data().get(), 1210aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1211aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1212aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1213afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1214afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1215afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1216d49cd2b7SBarry Smith CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1217afb2bd1cSJunchao Zhang #else 1218afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1219d49cd2b7SBarry Smith CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1220afb2bd1cSJunchao Zhang #endif 1221d49cd2b7SBarry Smith 1222da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1223da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224aa372e3fSPaul Mullowney 1225afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1226da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1227afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 12281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1229afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1230afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1231afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1232afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1233afb2bd1cSJunchao Zhang 
&upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1234afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1235afb2bd1cSJunchao Zhang #endif 1236afb2bd1cSJunchao Zhang 1237afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1238aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1239afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1240afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1241d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 12421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1243d49cd2b7SBarry Smith upTriFactorT->solveInfo, 1244d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1245d49cd2b7SBarry Smith #else 1246d49cd2b7SBarry Smith upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1247afb2bd1cSJunchao Zhang #endif 1248d49cd2b7SBarry Smith 1249da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1250da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1251aa372e3fSPaul Mullowney 1252da79fbbcSStefano Zampini /* assign the pointer */ 1253aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1254bda325fcSPaul Mullowney PetscFunctionReturn(0); 1255bda325fcSPaul Mullowney } 1256bda325fcSPaul Mullowney 1257a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1258a49f1ed0SStefano Zampini { 1259a49f1ed0SStefano Zampini __host__ __device__ 1260a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1261a49f1ed0SStefano Zampini { 1262a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1263a49f1ed0SStefano Zampini } 1264a49f1ed0SStefano Zampini }; 1265a49f1ed0SStefano Zampini 12663606e59fSJunchao Zhang static 
PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1267bda325fcSPaul Mullowney { 1268aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1269a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1270bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1271bda325fcSPaul Mullowney cusparseStatus_t stat; 1272aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1273b06137fdSPaul Mullowney cudaError_t err; 127485ba7357SStefano Zampini PetscErrorCode ierr; 1275b175d8bbSPaul Mullowney 1276bda325fcSPaul Mullowney PetscFunctionBegin; 1277a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1278a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1279e8d2b73aSMark Adams if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1280a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1281e8d2b73aSMark Adams if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12821a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 128385ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1284ee7b52eaSHong Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1285a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1286a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1287a49f1ed0SStefano Zampini } 1288a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1289aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 129057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1291aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 129257d48284SJunchao Zhang stat = 
cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 129357d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1294aa372e3fSPaul Mullowney 1295b06137fdSPaul Mullowney /* set alpha and beta */ 1296afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12977656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12987656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1299afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 13007656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 13017656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1302b06137fdSPaul Mullowney 1303aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1304aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1305a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1306554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1307554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1308aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1309a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1310aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1311aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1312a3fdcf43SKarl Rupp 1313039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 131481902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1315afb2bd1cSJunchao Zhang 
1316afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 13173606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1318afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1319afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1320afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1321afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1322afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1323afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 13243606e59fSJunchao Zhang #else 13253606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 13263606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 13273606e59fSJunchao Zhang 13283606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 13293606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 13303606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
13313606e59fSJunchao Zhang */ 13323606e59fSJunchao Zhang if (matrixT->num_entries) { 13333606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 13343606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 13353606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 13363606e59fSJunchao Zhang matrixT->values->data().get(), 13373606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 13383606e59fSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 13393606e59fSJunchao Zhang 13403606e59fSJunchao Zhang } else { 13413606e59fSJunchao Zhang matstructT->matDescr = NULL; 13423606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13433606e59fSJunchao Zhang } 13443606e59fSJunchao Zhang #endif 1345afb2bd1cSJunchao Zhang #endif 1346aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1347afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1348afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1349afb2bd1cSJunchao Zhang #else 1350aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 135151c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 135251c6d536SStefano Zampini /* First convert HYB to CSR */ 1353aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1354aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1355aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1356aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1357aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1358aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1359aa372e3fSPaul Mullowney 1360aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1361aa372e3fSPaul Mullowney matstruct->descr, 
(cusparseHybMat_t)matstruct->mat, 1362aa372e3fSPaul Mullowney temp->values->data().get(), 1363aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 136457d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1365aa372e3fSPaul Mullowney 1366aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1367aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1368aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1369aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1370aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1371aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1372aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1373aa372e3fSPaul Mullowney 1374aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1375aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1376aa372e3fSPaul Mullowney temp->values->data().get(), 1377aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1378aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1379aa372e3fSPaul Mullowney tempT->values->data().get(), 1380aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1381aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 138257d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1383aa372e3fSPaul Mullowney 1384aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1385aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 138657d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1387aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1388aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1389aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1390aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1391aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1392aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 139357d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1394aa372e3fSPaul Mullowney 1395aa372e3fSPaul Mullowney /* assign the pointer */ 1396aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13971a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1398aa372e3fSPaul Mullowney /* delete temporaries */ 1399aa372e3fSPaul Mullowney if (tempT) { 1400aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1401aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1402aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1403aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1404087f3262SPaul Mullowney } 1405aa372e3fSPaul Mullowney if (temp) { 1406aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1407aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1408aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1409aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1410aa372e3fSPaul Mullowney } 1411afb2bd1cSJunchao Zhang #endif 1412aa372e3fSPaul Mullowney } 1413a49f1ed0SStefano Zampini } 1414a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1415a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1416a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1417e8d2b73aSMark Adams if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1418e8d2b73aSMark Adams if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1419e8d2b73aSMark Adams if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1420e8d2b73aSMark Adams if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1421e8d2b73aSMark Adams if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1422e8d2b73aSMark Adams if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1423e8d2b73aSMark Adams if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1424e8d2b73aSMark Adams if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1425a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1426a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1427a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1428a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1429a49f1ed0SStefano Zampini } 1430a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1431a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1432a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1433a49f1ed0SStefano Zampini 1434a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1435a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1436a49f1ed0SStefano Zampini void *csr2cscBuffer; 1437a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1438a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 
1439a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1440a49f1ed0SStefano Zampini matrix->values->data().get(), 1441a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1442a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1443a49f1ed0SStefano Zampini matrixT->values->data().get(), 1444a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1445a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1446a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1447a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1448a49f1ed0SStefano Zampini #endif 1449a49f1ed0SStefano Zampini 14501a2c6b5cSJunchao Zhang if (matrix->num_entries) { 14511a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 14521a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 14531a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 14541a2c6b5cSJunchao Zhang 14551a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 14561a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
14571a2c6b5cSJunchao Zhang */ 14581a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 14591a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 14601a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 14611a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 14621a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1463a49f1ed0SStefano Zampini matrixT->values->data().get(), 1464a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1465a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1466a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 14671a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1468a49f1ed0SStefano Zampini #else 1469a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14701a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1471a49f1ed0SStefano Zampini #endif 14721a2c6b5cSJunchao Zhang } else { 14731a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14741a2c6b5cSJunchao Zhang } 14751a2c6b5cSJunchao Zhang 1476a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1477a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1478a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1479a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1480a49f1ed0SStefano Zampini #endif 1481a49f1ed0SStefano Zampini } 1482a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1483a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1484a49f1ed0SStefano Zampini matrixT->values->begin())); 1485a49f1ed0SStefano Zampini } 1486ee7b52eaSHong Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 148785ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1488213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1489213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1490aa372e3fSPaul Mullowney /* assign the pointer */ 1491aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14921a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1493bda325fcSPaul Mullowney PetscFunctionReturn(0); 1494bda325fcSPaul Mullowney } 1495bda325fcSPaul Mullowney 1496a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14976fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1498bda325fcSPaul Mullowney { 1499c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1500465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1501465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1502465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1503465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1504bda325fcSPaul Mullowney cusparseStatus_t stat; 1505bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1506aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1507aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1508aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1509b175d8bbSPaul 
Mullowney PetscErrorCode ierr; 1510bda325fcSPaul Mullowney 1511bda325fcSPaul Mullowney PetscFunctionBegin; 1512aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1513aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1514bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1515aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1516aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1517bda325fcSPaul Mullowney } 1518bda325fcSPaul Mullowney 1519bda325fcSPaul Mullowney /* Get the GPU pointers */ 1520c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1521c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1522c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1523c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1524bda325fcSPaul Mullowney 15257a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1526aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1527a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1528c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1529c41cb2e2SAlejandro Lamas Daviña xGPU); 1530aa372e3fSPaul Mullowney 1531aa372e3fSPaul Mullowney /* First, solve U */ 1532aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1533afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15341b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1535afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1536afb2bd1cSJunchao Zhang #endif 1537afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1538aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1539aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1540aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1541aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1542d49cd2b7SBarry Smith xarray, 15431b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1544d49cd2b7SBarry Smith tempGPU->data().get(), 1545d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1546d49cd2b7SBarry Smith #else 1547d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1548afb2bd1cSJunchao Zhang #endif 1549aa372e3fSPaul Mullowney 1550aa372e3fSPaul Mullowney /* Then, solve L */ 1551aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1552afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15531b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1554afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1555afb2bd1cSJunchao Zhang #endif 1556afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1557aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1558aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1559aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1560aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1561d49cd2b7SBarry Smith tempGPU->data().get(), 15621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1563d49cd2b7SBarry Smith xarray, 1564d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1565d49cd2b7SBarry Smith #else 1566d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1567afb2bd1cSJunchao Zhang #endif 1568aa372e3fSPaul Mullowney 1569aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1570a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1571c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1572aa372e3fSPaul Mullowney tempGPU->begin()); 1573aa372e3fSPaul Mullowney 1574aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1575a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1576bda325fcSPaul Mullowney 1577bda325fcSPaul Mullowney /* restore */ 1578c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1579c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1580661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1581958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1582bda325fcSPaul Mullowney PetscFunctionReturn(0); 1583bda325fcSPaul Mullowney } 1584bda325fcSPaul Mullowney 15856fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1586bda325fcSPaul Mullowney { 1587465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1588465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1589bda325fcSPaul Mullowney cusparseStatus_t stat; 1590bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1591aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1592aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1593aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1594b175d8bbSPaul Mullowney 
PetscErrorCode ierr; 1595bda325fcSPaul Mullowney 1596bda325fcSPaul Mullowney PetscFunctionBegin; 1597aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1598aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1599bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1600aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1601aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1602bda325fcSPaul Mullowney } 1603bda325fcSPaul Mullowney 1604bda325fcSPaul Mullowney /* Get the GPU pointers */ 1605c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1606c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1607bda325fcSPaul Mullowney 16087a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1609aa372e3fSPaul Mullowney /* First, solve U */ 1610aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1611afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 16121b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1614afb2bd1cSJunchao Zhang #endif 1615afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1616aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1617aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1618aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1619aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1620d49cd2b7SBarry Smith barray, 16211b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1622d49cd2b7SBarry Smith tempGPU->data().get(), 1623d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1624d49cd2b7SBarry Smith #else 
1625d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1626afb2bd1cSJunchao Zhang #endif 1627aa372e3fSPaul Mullowney 1628aa372e3fSPaul Mullowney /* Then, solve L */ 1629aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1630afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 16311b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1632afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1633afb2bd1cSJunchao Zhang #endif 1634afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1635aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1636aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1637aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1638aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1639d49cd2b7SBarry Smith tempGPU->data().get(), 16401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1641d49cd2b7SBarry Smith xarray, 1642d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1643d49cd2b7SBarry Smith #else 1644d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1645afb2bd1cSJunchao Zhang #endif 1646bda325fcSPaul Mullowney 1647bda325fcSPaul Mullowney /* restore */ 1648c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1649c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1650661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1651958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1652bda325fcSPaul Mullowney PetscFunctionReturn(0); 1653bda325fcSPaul Mullowney } 1654bda325fcSPaul Mullowney 16556fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 16569ae82921SPaul Mullowney { 1657465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1658465f34aeSAlejandro Lamas Daviña PetscScalar 
*xarray; 1659465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1660465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 16619ae82921SPaul Mullowney cusparseStatus_t stat; 16629ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1663aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1664aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1665aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1666b175d8bbSPaul Mullowney PetscErrorCode ierr; 16679ae82921SPaul Mullowney 16689ae82921SPaul Mullowney PetscFunctionBegin; 1669ebc8f436SDominic Meiser 1670e057df02SPaul Mullowney /* Get the GPU pointers */ 1671c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1672c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1673c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1674c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16759ae82921SPaul Mullowney 16767a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1677aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1678a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1679c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16804e4bbfaaSStefano Zampini tempGPU->begin()); 1681aa372e3fSPaul Mullowney 1682aa372e3fSPaul Mullowney /* Next, solve L */ 1683aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1684afb2bd1cSJunchao Zhang 
loTriFactor->csrMat->num_rows, 16851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1686afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1687afb2bd1cSJunchao Zhang #endif 1688afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1689aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1690aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1691aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1692aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1693d49cd2b7SBarry Smith tempGPU->data().get(), 16941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1695d49cd2b7SBarry Smith xarray, 1696d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1697d49cd2b7SBarry Smith #else 1698d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1699afb2bd1cSJunchao Zhang #endif 1700aa372e3fSPaul Mullowney 1701aa372e3fSPaul Mullowney /* Then, solve U */ 1702aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1703afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1705afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1706afb2bd1cSJunchao Zhang #endif 1707afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1708aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1709aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1710aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1711d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 17121b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1713d49cd2b7SBarry Smith tempGPU->data().get(), 1714d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1715d49cd2b7SBarry Smith #else 1716d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1717afb2bd1cSJunchao Zhang #endif 
1718d49cd2b7SBarry Smith 17194e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1720a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 17214e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 17224e4bbfaaSStefano Zampini xGPU); 17239ae82921SPaul Mullowney 1724c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1725c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1726661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1727958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17289ae82921SPaul Mullowney PetscFunctionReturn(0); 17299ae82921SPaul Mullowney } 17309ae82921SPaul Mullowney 17316fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 17329ae82921SPaul Mullowney { 1733465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1734465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 17359ae82921SPaul Mullowney cusparseStatus_t stat; 17369ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1737aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1738aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1739aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1740b175d8bbSPaul Mullowney PetscErrorCode ierr; 17419ae82921SPaul Mullowney 17429ae82921SPaul Mullowney PetscFunctionBegin; 1743e057df02SPaul Mullowney /* Get the GPU pointers */ 1744c41cb2e2SAlejandro Lamas Daviña ierr = 
VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1745c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 17469ae82921SPaul Mullowney 17477a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1748aa372e3fSPaul Mullowney /* First, solve L */ 1749aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1750afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 17511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1752afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1753afb2bd1cSJunchao Zhang #endif 1754afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1755aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1756aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1757aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1758aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1759d49cd2b7SBarry Smith barray, 17601b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1761d49cd2b7SBarry Smith tempGPU->data().get(), 1762d49cd2b7SBarry Smith loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1763d49cd2b7SBarry Smith #else 1764d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1765afb2bd1cSJunchao Zhang #endif 1766d49cd2b7SBarry Smith 1767aa372e3fSPaul Mullowney /* Next, solve U */ 1768aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1769afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1771afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1772afb2bd1cSJunchao Zhang #endif 1773afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1774aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1775aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1776aa372e3fSPaul Mullowney 
upTriFactor->csrMat->column_indices->data().get(), 1777aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1778d49cd2b7SBarry Smith tempGPU->data().get(), 17791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1780d49cd2b7SBarry Smith xarray, 1781d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1782d49cd2b7SBarry Smith #else 1783d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1784afb2bd1cSJunchao Zhang #endif 17859ae82921SPaul Mullowney 1786c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1787c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1788661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1789958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17909ae82921SPaul Mullowney PetscFunctionReturn(0); 17919ae82921SPaul Mullowney } 17929ae82921SPaul Mullowney 17937e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17947e8381f9SStefano Zampini { 17957e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17967e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17977e8381f9SStefano Zampini cudaError_t cerr; 17987e8381f9SStefano Zampini PetscErrorCode ierr; 17997e8381f9SStefano Zampini 18007e8381f9SStefano Zampini PetscFunctionBegin; 18017e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 18027e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 18037e8381f9SStefano Zampini 18047e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 18057e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 18067e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 18077e8381f9SStefano Zampini ierr = 
PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 18087e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 18097e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 18107e8381f9SStefano Zampini } 18117e8381f9SStefano Zampini PetscFunctionReturn(0); 18127e8381f9SStefano Zampini } 18137e8381f9SStefano Zampini 18147e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 18157e8381f9SStefano Zampini { 18167e8381f9SStefano Zampini PetscErrorCode ierr; 18177e8381f9SStefano Zampini 18187e8381f9SStefano Zampini PetscFunctionBegin; 18197e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 182067a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 182167a45760SJunchao Zhang PetscFunctionReturn(0); 182267a45760SJunchao Zhang } 182367a45760SJunchao Zhang 182467a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 182567a45760SJunchao Zhang { 182667a45760SJunchao Zhang PetscFunctionBegin; 18277e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 182867a45760SJunchao Zhang *array = NULL; 182967a45760SJunchao Zhang PetscFunctionReturn(0); 183067a45760SJunchao Zhang } 183167a45760SJunchao Zhang 183267a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 183367a45760SJunchao Zhang { 183467a45760SJunchao Zhang PetscErrorCode ierr; 183567a45760SJunchao Zhang 183667a45760SJunchao Zhang PetscFunctionBegin; 183767a45760SJunchao Zhang ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 183867a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 183967a45760SJunchao Zhang PetscFunctionReturn(0); 184067a45760SJunchao Zhang } 184167a45760SJunchao Zhang 184267a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 184367a45760SJunchao Zhang { 184467a45760SJunchao Zhang 
PetscFunctionBegin; 184567a45760SJunchao Zhang *array = NULL; 184667a45760SJunchao Zhang PetscFunctionReturn(0); 184767a45760SJunchao Zhang } 184867a45760SJunchao Zhang 184967a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 185067a45760SJunchao Zhang { 185167a45760SJunchao Zhang PetscFunctionBegin; 185267a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 185367a45760SJunchao Zhang PetscFunctionReturn(0); 185467a45760SJunchao Zhang } 185567a45760SJunchao Zhang 185667a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 185767a45760SJunchao Zhang { 185867a45760SJunchao Zhang PetscFunctionBegin; 185967a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 186067a45760SJunchao Zhang *array = NULL; 18617e8381f9SStefano Zampini PetscFunctionReturn(0); 18627e8381f9SStefano Zampini } 18637e8381f9SStefano Zampini 1864042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 18659ae82921SPaul Mullowney { 1866aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 18677c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 18689ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1869213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 18709ae82921SPaul Mullowney PetscErrorCode ierr; 1871aa372e3fSPaul Mullowney cusparseStatus_t stat; 1872abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1873b06137fdSPaul Mullowney cudaError_t err; 18749ae82921SPaul Mullowney 18759ae82921SPaul Mullowney PetscFunctionBegin; 1876e8d2b73aSMark Adams if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1877c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1878a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == 
MAT_CUSPARSE_CSR) { /* Copy values only */ 1879a49f1ed0SStefano Zampini CsrMatrix *matrix; 1880afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 188185ba7357SStefano Zampini 1882e8d2b73aSMark Adams if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 188385ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1884afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 188505035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 18864863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 188785ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1888a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 188934d6c7a5SJose E. Roman } else { 1890abb89eb1SStefano Zampini PetscInt nnz; 189185ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 18927c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1893a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 18947c700b8dSJunchao Zhang delete cusparsestruct->workVector; 189581902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1896a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1897a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18989ae82921SPaul Mullowney try { 18999ae82921SPaul Mullowney if (a->compressedrow.use) { 19009ae82921SPaul Mullowney m = a->compressedrow.nrows; 19019ae82921SPaul Mullowney ii = a->compressedrow.i; 19029ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 19039ae82921SPaul Mullowney } else { 1904213423ffSJunchao Zhang m = A->rmap->n; 1905213423ffSJunchao Zhang ii = a->i; 1906e6e9a74fSStefano Zampini ridx = NULL; 19079ae82921SPaul Mullowney } 1908e8d2b73aSMark Adams if (!ii) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1909e8d2b73aSMark Adams if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1910abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1911abb89eb1SStefano Zampini else nnz = a->nz; 19129ae82921SPaul Mullowney 191385ba7357SStefano Zampini /* create cusparse matrix */ 1914abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1915aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 191657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 191757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 191857d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 19199ae82921SPaul Mullowney 1920afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 19217656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 19227656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1923afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 19247656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 19257656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 192657d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1927b06137fdSPaul Mullowney 1928aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1929aa372e3fSPaul Mullowney if 
(cusparsestruct->format==MAT_CUSPARSE_CSR) { 1930aa372e3fSPaul Mullowney /* set the matrix */ 1931afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1932afb2bd1cSJunchao Zhang mat->num_rows = m; 1933afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1934abb89eb1SStefano Zampini mat->num_entries = nnz; 1935afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1936afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 19379ae82921SPaul Mullowney 1938abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1939abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1940aa372e3fSPaul Mullowney 1941abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1942abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1943aa372e3fSPaul Mullowney 1944aa372e3fSPaul Mullowney /* assign the pointer */ 1945afb2bd1cSJunchao Zhang matstruct->mat = mat; 1946afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1947afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1948afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1949afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1950afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1951afb2bd1cSJunchao Zhang mat->values->data().get(), 1952afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1953afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1954afb2bd1cSJunchao Zhang } 1955afb2bd1cSJunchao Zhang #endif 1956aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1957afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1958afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1959afb2bd1cSJunchao Zhang #else 1960afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1961afb2bd1cSJunchao Zhang mat->num_rows = m; 1962afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1963abb89eb1SStefano Zampini mat->num_entries = nnz; 1964afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1965afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1966aa372e3fSPaul Mullowney 1967abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1968abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1969aa372e3fSPaul Mullowney 1970abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1971abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1972aa372e3fSPaul Mullowney 1973aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 197457d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1975aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1976aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1977afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1978afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1979afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1980afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 198157d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1982aa372e3fSPaul Mullowney /* assign the pointer */ 1983aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1984aa372e3fSPaul Mullowney 1985afb2bd1cSJunchao Zhang if (mat) { 1986afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1987afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1988afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1989afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1990087f3262SPaul Mullowney } 1991afb2bd1cSJunchao Zhang #endif 1992087f3262SPaul Mullowney } 1993ca45077fSPaul Mullowney 1994aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1995213423ffSJunchao Zhang if (a->compressedrow.use) { 1996213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1997aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1998aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1999213423ffSJunchao Zhang tmp = m; 2000213423ffSJunchao Zhang } else { 2001213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2002213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2003213423ffSJunchao Zhang tmp = 0; 2004213423ffSJunchao Zhang } 2005213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2006aa372e3fSPaul Mullowney 2007aa372e3fSPaul Mullowney /* assign the pointer */ 2008aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 20099ae82921SPaul Mullowney } 
catch(char *ex) { 201098921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 20119ae82921SPaul Mullowney } 201205035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 201385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 201434d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 201534d6c7a5SJose E. Roman } 2016abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 20179ae82921SPaul Mullowney } 20189ae82921SPaul Mullowney PetscFunctionReturn(0); 20199ae82921SPaul Mullowney } 20209ae82921SPaul Mullowney /* Functor on a tuple-like argument read with thrust::get: accumulates element 0 into element 1, i.e. get<1>(t) = get<1>(t) + get<0>(t). Callable on host and device. */2021c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 2022aa372e3fSPaul Mullowney { 2023aa372e3fSPaul Mullowney template <typename Tuple> 2024aa372e3fSPaul Mullowney __host__ __device__ 2025aa372e3fSPaul Mullowney void operator()(Tuple t) 2026aa372e3fSPaul Mullowney { 2027aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2028aa372e3fSPaul Mullowney } 2029aa372e3fSPaul Mullowney }; 2030aa372e3fSPaul Mullowney /* Functor on a tuple-like argument: copies element 0 into element 1 (get<1>(t) = get<0>(t)). Callable on host and device. */20317e8381f9SStefano Zampini struct VecCUDAEquals 20327e8381f9SStefano Zampini { 20337e8381f9SStefano Zampini template <typename Tuple> 20347e8381f9SStefano Zampini __host__ __device__ 20357e8381f9SStefano Zampini void operator()(Tuple t) 20367e8381f9SStefano Zampini { 20377e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 20387e8381f9SStefano Zampini } 20397e8381f9SStefano Zampini }; 20407e8381f9SStefano Zampini /* Functor on a tuple-like argument: copies element 1 into element 0 (get<0>(t) = get<1>(t)), the opposite direction of VecCUDAEquals. Callable on host and device. */2041e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 2042e6e9a74fSStefano Zampini { 2043e6e9a74fSStefano Zampini template <typename Tuple> 2044e6e9a74fSStefano Zampini __host__ __device__ 2045e6e9a74fSStefano Zampini void operator()(Tuple t) 2046e6e9a74fSStefano Zampini { 2047e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2048e6e9a74fSStefano Zampini } 2049e6e9a74fSStefano Zampini }; 2050e6e9a74fSStefano Zampini 2051afb2bd1cSJunchao Zhang struct MatMatCusparse {
/* MatMatCusparse: per-product scratch data that the symbolic routines store in C->product->data and that MatDestroy_MatMatCusparse releases. */ 2052ccdfe979SStefano Zampini PetscBool cisdense; /* recorded by the sparse-dense symbolic phase: C was MATSEQDENSE, so the numeric phase converts the result back */2053ccdfe979SStefano Zampini PetscScalar *Bt; /* device buffer for B^T, allocated with cudaMalloc for ABt and RARt when CUDA is older than 11.0; released with cudaFree */2054ccdfe979SStefano Zampini Mat X; /* intermediate dense matrix used by the PtAP and RARt sparse-dense products */2055fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2056fcdce8c4SStefano Zampini PetscLogDouble flops; 2057fcdce8c4SStefano Zampini CsrMatrix *Bcsr; /* when non-NULL, used by the sparse-sparse numeric routine in place of the CSR data held by B */2058b4285af6SJunchao Zhang 2059afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2060fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2061afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2062afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2063afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; /* dense descriptor of the SpMM output: C, or X for PtAP and RARt */2064afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2065b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2066b4285af6SJunchao Zhang void *dBuffer4; 2067b4285af6SJunchao Zhang void *dBuffer5; 2068b4285af6SJunchao Zhang #endif 2069fcdce8c4SStefano Zampini size_t mmBufferSize; 2070fcdce8c4SStefano Zampini void *mmBuffer; /* device work buffer of mmBufferSize bytes passed to cusparseSpMM */2071fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2072fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2073afb2bd1cSJunchao Zhang #endif 2074afb2bd1cSJunchao Zhang }; 2075ccdfe979SStefano Zampini /* Destroy callback for MatMatCusparse product data (installed as C->product->destroy by the symbolic phase). Frees the device buffers, destroys the cuSPARSE descriptors that were created, destroys the intermediate matrix X and finally frees the struct itself. data must point to a MatMatCusparse. Returns 0 or a PETSc error code. */2076ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2077ccdfe979SStefano Zampini { 2078ccdfe979SStefano Zampini PetscErrorCode ierr; 2079ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2080ccdfe979SStefano Zampini cudaError_t cerr; 2081fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2082fcdce8c4SStefano Zampini cusparseStatus_t stat; 2083fcdce8c4SStefano Zampini #endif 2084ccdfe979SStefano Zampini 2085ccdfe979SStefano Zampini PetscFunctionBegin; 2086ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2087fcdce8c4SStefano Zampini delete
mmdata->Bcsr; 2088afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) /* descriptors and buffers below are only destroyed when non-NULL */ 2089fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 2090afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2091afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2092fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2093b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2094b4285af6SJunchao Zhang if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2095b4285af6SJunchao Zhang if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2096b4285af6SJunchao Zhang #endif 2097b4285af6SJunchao Zhang if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2098b4285af6SJunchao Zhang if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2099afb2bd1cSJunchao Zhang #endif 2100ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2101ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 2102ccdfe979SStefano Zampini PetscFunctionReturn(0); 2103ccdfe979SStefano Zampini } 2104ccdfe979SStefano Zampini 2105ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2106ccdfe979SStefano Zampini 2107ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2108ccdfe979SStefano Zampini { 2109ccdfe979SStefano Zampini Mat_Product *product = C->product; 2110ccdfe979SStefano Zampini Mat A,B; 2111afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2112ccdfe979SStefano Zampini PetscBool flg,biscuda; 2113ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2114ccdfe979SStefano Zampini
/* MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA: numeric phase of a product between a MATSEQAIJCUSPARSE matrix A and a dense matrix B. Handles AB, AtB, ABt, PtAP and RARt. For PtAP and RARt the sparse-dense product is written to mmdata->X and then combined with B through MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private. Requires C->product->data to have been set by the symbolic phase. Returns 0 or a PETSc error code. */ cusparseStatus_t stat; 2115ccdfe979SStefano Zampini cusparseOperation_t opA; 2116ccdfe979SStefano Zampini const PetscScalar *barray; 2117ccdfe979SStefano Zampini PetscScalar *carray; 2118ccdfe979SStefano Zampini PetscErrorCode ierr; 2119ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2120ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2121ccdfe979SStefano Zampini CsrMatrix *csrmat; 2122ccdfe979SStefano Zampini 2123ccdfe979SStefano Zampini PetscFunctionBegin; 2124ccdfe979SStefano Zampini MatCheckProduct(C,1); 2125e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2126ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2127ccdfe979SStefano Zampini A = product->A; 2128ccdfe979SStefano Zampini B = product->B; 2129ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 213098921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2131ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2132ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 2133ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2134ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2135ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2136ccdfe979SStefano Zampini switch (product->type) { 2137ccdfe979SStefano Zampini case MATPRODUCT_AB: 2138ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2139ccdfe979SStefano Zampini mat = cusp->mat; 2140ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2141ccdfe979SStefano Zampini m = A->rmap->n; 2142ccdfe979SStefano Zampini n = B->cmap->n;
2143ccdfe979SStefano Zampini break; 2144ccdfe979SStefano Zampini case MATPRODUCT_AtB: /* AtB: either ask cuSPARSE to apply the transpose to A, or multiply with the explicitly formed transpose */21451a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2146e6e9a74fSStefano Zampini mat = cusp->mat; 2147e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2148e6e9a74fSStefano Zampini } else { 21493606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2150ccdfe979SStefano Zampini mat = cusp->matTranspose; 2151ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2152e6e9a74fSStefano Zampini } 2153ccdfe979SStefano Zampini m = A->cmap->n; 2154ccdfe979SStefano Zampini n = B->cmap->n; 2155ccdfe979SStefano Zampini break; 2156ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2157ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2158ccdfe979SStefano Zampini mat = cusp->mat; 2159ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2160ccdfe979SStefano Zampini m = A->rmap->n; 2161ccdfe979SStefano Zampini n = B->rmap->n; 2162ccdfe979SStefano Zampini break; 2163ccdfe979SStefano Zampini default: 216498921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2165ccdfe979SStefano Zampini } 2166e8d2b73aSMark Adams if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2167ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2168ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2169ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2170afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2171ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2172afb2bd1cSJunchao Zhang 2173ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2174c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt
|| product->type == MATPRODUCT_PtAP) { 2175c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2176c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2177c8378d12SStefano Zampini } else { 2178c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2179c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2180c8378d12SStefano Zampini } 2181c8378d12SStefano Zampini 2182c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2183afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2184afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2185a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ /* NOTE(review): when this branch is taken because only one of Blda/Clda changed, the descriptor that is kept (and mat->matDescr) is not refreshed with the current barray/carray/values pointers, since the SetValues calls live in the else branch only. Confirm the device pointers cannot change between calls in that situation. */2186afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2187fcdce8c4SStefano Zampini size_t mmBufferSize; 2188afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2189afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2190afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2191afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2192afb2bd1cSJunchao Zhang } 2193c8378d12SStefano Zampini 2194afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2195afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2196afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2197afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2198afb2bd1cSJunchao Zhang } 2199afb2bd1cSJunchao Zhang 2200afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2201afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2202afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2203afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2204afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2205afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2206afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2207afb2bd1cSJunchao Zhang } 2208afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2209afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2210afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2211fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); /* grow the work buffer only when it is missing or too small; cudaFree on a NULL pointer is accepted */ 2212fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2213ee7b52eaSHong Zhang cudaError_t cerr; 2214fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2215fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2216fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2217fcdce8c4SStefano Zampini } 2218afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2219afb2bd1cSJunchao Zhang } else { 2220afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2221afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2222afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2223afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2224afb2bd1cSJunchao Zhang } 2225afb2bd1cSJunchao Zhang 2226afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2227afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2228afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2229afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2230fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2231afb2bd1cSJunchao Zhang #else 2232afb2bd1cSJunchao Zhang PetscInt k; 2233afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ /* so for ABt and RARt the transpose of B is first written into mmdata->Bt with cublasXgeam, and blda/k are switched to describe that buffer */ 2234ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2235ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2236ccdfe979SStefano Zampini cublasStatus_t cerr; 2237ccdfe979SStefano Zampini 2238ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2239ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2240ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2241ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2242ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2243ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2244ccdfe979SStefano Zampini blda = B->cmap->n; 2245afb2bd1cSJunchao Zhang k = B->cmap->n; 2246afb2bd1cSJunchao Zhang } else { 2247afb2bd1cSJunchao Zhang k = B->rmap->n; 2248ccdfe979SStefano Zampini } 2249ccdfe979SStefano Zampini 2250afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2251ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2252afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2253ccdfe979SStefano Zampini csrmat->values->data().get(), 2254ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2255ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2256ccdfe979SStefano Zampini mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero, 2257ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2258afb2bd1cSJunchao Zhang #endif 2259c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); /* logged cost: 2 flops per stored entry of the sparse operand for each of the n columns of the SpMM result */2260c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2261ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); /* RARt and PtAP: the SpMM result sits in X and still has to be multiplied with B to obtain C */2262ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2263ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2264ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2265ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2266ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2267ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2268ccdfe979SStefano Zampini } else { 2269ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2270ccdfe979SStefano Zampini } /* C was MATSEQDENSE before the symbolic phase: convert the result back in place */2271ccdfe979SStefano Zampini if (mmdata->cisdense) { 2272ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2273ccdfe979SStefano Zampini } /* undo the temporary in-place conversion of B to MATSEQDENSECUDA done above */2274ccdfe979SStefano Zampini if (!biscuda) { 2275ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2276ccdfe979SStefano Zampini } 2277ccdfe979SStefano Zampini PetscFunctionReturn(0); 2278ccdfe979SStefano Zampini } 2279ccdfe979SStefano Zampini 2280ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2281ccdfe979SStefano Zampini { 2282ccdfe979SStefano Zampini Mat_Product *product = C->product; 2283ccdfe979SStefano Zampini Mat A,B; 2284ccdfe979SStefano Zampini PetscInt m,n; 2285ccdfe979SStefano Zampini PetscBool cisdense,flg;
/* MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA: symbolic phase of a product between a MATSEQAIJCUSPARSE matrix A (CSR format only) and a dense matrix B. Sets the sizes of C, switches C to MATSEQDENSECUDA, allocates the MatMatCusparse product data (plus the B^T buffer and the intermediate matrix X where needed) and installs the numeric and destroy callbacks. Errors if product data is already attached to C. Returns 0 or a PETSc error code. */ 2286ccdfe979SStefano Zampini PetscErrorCode ierr; 2287ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2288ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2289ccdfe979SStefano Zampini 2290ccdfe979SStefano Zampini PetscFunctionBegin; 2291ccdfe979SStefano Zampini MatCheckProduct(C,1); 2292e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2293ccdfe979SStefano Zampini A = product->A; 2294ccdfe979SStefano Zampini B = product->B; 2295ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 229698921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2297ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2298e8d2b73aSMark Adams if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); /* (m,n) = local size of the result C for each supported product type */2299ccdfe979SStefano Zampini switch (product->type) { 2300ccdfe979SStefano Zampini case MATPRODUCT_AB: 2301ccdfe979SStefano Zampini m = A->rmap->n; 2302ccdfe979SStefano Zampini n = B->cmap->n; 2303ccdfe979SStefano Zampini break; 2304ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2305ccdfe979SStefano Zampini m = A->cmap->n; 2306ccdfe979SStefano Zampini n = B->cmap->n; 2307ccdfe979SStefano Zampini break; 2308ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2309ccdfe979SStefano Zampini m = A->rmap->n; 2310ccdfe979SStefano Zampini n = B->rmap->n; 2311ccdfe979SStefano Zampini break; 2312ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2313ccdfe979SStefano Zampini m = B->cmap->n; 2314ccdfe979SStefano Zampini n = B->cmap->n; 2315ccdfe979SStefano Zampini break; 2316ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2317ccdfe979SStefano Zampini m = B->rmap->n; 2318ccdfe979SStefano Zampini n = B->rmap->n; 2319ccdfe979SStefano Zampini break; 2320ccdfe979SStefano Zampini default: 232198921bdaSJacob Faibussowitsch
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2322ccdfe979SStefano Zampini } 2323ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2324ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2325ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2326ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2327ccdfe979SStefano Zampini 2328ccdfe979SStefano Zampini /* product data */ 2329ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2330ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2331afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2332afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2333ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2334afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2335ccdfe979SStefano Zampini } 2336afb2bd1cSJunchao Zhang #endif 2337ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2338ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2339ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2340ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2341ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2342ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2343ccdfe979SStefano Zampini } else { 2344ccdfe979SStefano Zampini ierr =
MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2345ccdfe979SStefano Zampini } 2346ccdfe979SStefano Zampini } /* attach the product data and its destroy callback to C, then register the numeric routine */2347ccdfe979SStefano Zampini C->product->data = mmdata; 2348ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2349ccdfe979SStefano Zampini 2350ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2351ccdfe979SStefano Zampini PetscFunctionReturn(0); 2352ccdfe979SStefano Zampini } 2353ccdfe979SStefano Zampini 2354fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2355ccdfe979SStefano Zampini { 2356ccdfe979SStefano Zampini Mat_Product *product = C->product; 2357fcdce8c4SStefano Zampini Mat A,B; 2358fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2359fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2360fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2361fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2362fcdce8c4SStefano Zampini PetscBool flg; 2363ccdfe979SStefano Zampini PetscErrorCode ierr; 2364fcdce8c4SStefano Zampini cusparseStatus_t stat; 2365fcdce8c4SStefano Zampini cudaError_t cerr; 2366fcdce8c4SStefano Zampini MatProductType ptype; 2367fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2368fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2369fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2370fcdce8c4SStefano Zampini #endif 2371b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2372ccdfe979SStefano Zampini 2373ccdfe979SStefano Zampini PetscFunctionBegin; 2374ccdfe979SStefano Zampini MatCheckProduct(C,1); 2375e8d2b73aSMark Adams if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2376fcdce8c4SStefano Zampini ierr =
PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 237798921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2378fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2379fcdce8c4SStefano Zampini A = product->A; 2380fcdce8c4SStefano Zampini B = product->B; 2381fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2382fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2383fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2384e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2385fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 238698921bdaSJacob Faibussowitsch if (!Cmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2387fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2388e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2389fcdce8c4SStefano Zampini goto finalize; 2390fcdce8c4SStefano Zampini } 2391fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2392fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239398921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2394fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239598921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2396fcdce8c4SStefano Zampini if (A->boundtocpu) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2397fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2398fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2399fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2400fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2401e8d2b73aSMark Adams if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2402e8d2b73aSMark Adams if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2403e8d2b73aSMark Adams if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2404fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2405fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2406fcdce8c4SStefano Zampini 2407fcdce8c4SStefano Zampini ptype = product->type; 2408fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2409fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2410fa046f9fSJunchao Zhang if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2411fa046f9fSJunchao Zhang } 2412fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2413fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2414fa046f9fSJunchao Zhang if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2415fa046f9fSJunchao Zhang } 
2416fcdce8c4SStefano Zampini switch (ptype) { 2417fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2418fcdce8c4SStefano Zampini Amat = Acusp->mat; 2419fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2420fcdce8c4SStefano Zampini break; 2421fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2422fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2423fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2424fcdce8c4SStefano Zampini break; 2425fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2426fcdce8c4SStefano Zampini Amat = Acusp->mat; 2427fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2428fcdce8c4SStefano Zampini break; 2429fcdce8c4SStefano Zampini default: 243098921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2431fcdce8c4SStefano Zampini } 2432fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 243398921bdaSJacob Faibussowitsch if (!Amat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 243498921bdaSJacob Faibussowitsch if (!Bmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 243598921bdaSJacob Faibussowitsch if (!Cmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2436fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2437fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2438fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2439e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2440e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2441e8d2b73aSMark Adams if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2442fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2443fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2444fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2445b4285af6SJunchao Zhang stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2446b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2447b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2448b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2449b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2450b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2451b4285af6SJunchao Zhang #else 2452b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2453fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2454fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2455fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2456b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2457fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2458fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 
2459b4285af6SJunchao Zhang #endif 2460fcdce8c4SStefano Zampini #else 2461b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2462fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2463fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2464fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2465fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2466fcdce8c4SStefano Zampini #endif 2467fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2468fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2469fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2470fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2471fcdce8c4SStefano Zampini finalize: 2472fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 2473*7d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2474fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2475*7d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 2476fcdce8c4SStefano Zampini c->reallocs = 0; 2477fcdce8c4SStefano Zampini C->info.mallocs += 0; 2478fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2479fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2480fcdce8c4SStefano Zampini C->num_ass++; 2481ccdfe979SStefano Zampini PetscFunctionReturn(0); 2482ccdfe979SStefano Zampini } 2483fcdce8c4SStefano Zampini 2484fcdce8c4SStefano Zampini 
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2485fcdce8c4SStefano Zampini { 2486fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2487fcdce8c4SStefano Zampini Mat A,B; 2488fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2489fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2490fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2491fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2492fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2493fcdce8c4SStefano Zampini PetscBool flg; 2494fcdce8c4SStefano Zampini PetscErrorCode ierr; 2495fcdce8c4SStefano Zampini cusparseStatus_t stat; 2496fcdce8c4SStefano Zampini cudaError_t cerr; 2497fcdce8c4SStefano Zampini MatProductType ptype; 2498fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2499fcdce8c4SStefano Zampini PetscLogDouble flops; 2500fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2501fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2502fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2503fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2504fcdce8c4SStefano Zampini #else 2505fcdce8c4SStefano Zampini int cnz; 2506fcdce8c4SStefano Zampini #endif 2507b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2508fcdce8c4SStefano Zampini 2509fcdce8c4SStefano Zampini PetscFunctionBegin; 2510fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2511e8d2b73aSMark Adams if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2512fcdce8c4SStefano Zampini A = product->A; 2513fcdce8c4SStefano Zampini B = product->B; 2514fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 251598921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type 
%s",((PetscObject)A)->type_name); 2516fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 251798921bdaSJacob Faibussowitsch if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2518fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2519fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2520fcdce8c4SStefano Zampini /* product data */ 2521fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2522fcdce8c4SStefano Zampini C->product->data = mmdata; 2523fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2524fcdce8c4SStefano Zampini 2525fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2526fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2527d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2528d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2529d60bce21SJunchao Zhang if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2530d60bce21SJunchao Zhang if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2531d60bce21SJunchao Zhang 2532fcdce8c4SStefano Zampini ptype = product->type; 2533fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2534fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2535fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2536fa046f9fSJunchao Zhang } 2537fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2538fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2539fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2540fa046f9fSJunchao Zhang } 2541fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2542fcdce8c4SStefano 
Zampini ciscompressed = PETSC_FALSE; 2543fcdce8c4SStefano Zampini switch (ptype) { 2544fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2545fcdce8c4SStefano Zampini m = A->rmap->n; 2546fcdce8c4SStefano Zampini n = B->cmap->n; 2547fcdce8c4SStefano Zampini k = A->cmap->n; 2548fcdce8c4SStefano Zampini Amat = Acusp->mat; 2549fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2550fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2551fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2552fcdce8c4SStefano Zampini break; 2553fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2554fcdce8c4SStefano Zampini m = A->cmap->n; 2555fcdce8c4SStefano Zampini n = B->cmap->n; 2556fcdce8c4SStefano Zampini k = A->rmap->n; 25573606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2558fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2559fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2560fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2561fcdce8c4SStefano Zampini break; 2562fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2563fcdce8c4SStefano Zampini m = A->rmap->n; 2564fcdce8c4SStefano Zampini n = B->rmap->n; 2565fcdce8c4SStefano Zampini k = A->cmap->n; 25663606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2567fcdce8c4SStefano Zampini Amat = Acusp->mat; 2568fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2569fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2570fcdce8c4SStefano Zampini break; 2571fcdce8c4SStefano Zampini default: 257298921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2573fcdce8c4SStefano Zampini } 2574fcdce8c4SStefano Zampini 2575fcdce8c4SStefano Zampini /* create cusparse matrix */ 2576fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2577fcdce8c4SStefano Zampini ierr = 
MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2578fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2579fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2580fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2581fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2582fcdce8c4SStefano Zampini 2583fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2584fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2585fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2586fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2587fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2588fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2589fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2590fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2591fcdce8c4SStefano Zampini } else { 2592fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2593fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2594fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2595fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2596fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2597fcdce8c4SStefano Zampini } 2598fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2599fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2600fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2601fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2602fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2603fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2604fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2605fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2606fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2607fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2608fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2609fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2610fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2611fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2612fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2613fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2614fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2615fcdce8c4SStefano Zampini c->nz = 0; 2616fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2617fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2618fcdce8c4SStefano Zampini goto finalizesym; 2619fcdce8c4SStefano Zampini } 2620fcdce8c4SStefano Zampini 262198921bdaSJacob Faibussowitsch if (!Amat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 262298921bdaSJacob Faibussowitsch if (!Bmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2623fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2624fcdce8c4SStefano Zampini if (!biscompressed) { 2625fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2626fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2627fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2628fcdce8c4SStefano Zampini #endif 2629fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2630fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2631fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2632fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2633fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2634fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2635fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2636fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2637fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2638fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2639fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2640fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2641fcdce8c4SStefano Zampini } 
2642fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2643fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2644fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2645fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2646fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2647fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2648fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2649fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2650fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2651fcdce8c4SStefano Zampini } 2652fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2653fcdce8c4SStefano Zampini #endif 2654fcdce8c4SStefano Zampini } 2655e8d2b73aSMark Adams if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2656e8d2b73aSMark Adams if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2657fcdce8c4SStefano Zampini /* precompute flops count */ 2658fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2659fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2660fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2661fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2662fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2663fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2664fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2665fcdce8c4SStefano Zampini } 2666fcdce8c4SStefano Zampini } 2667fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2668fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2669fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2670fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2671fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 
2672fcdce8c4SStefano Zampini } 2673fcdce8c4SStefano Zampini } else { /* TODO */ 2674fcdce8c4SStefano Zampini flops = 0.; 2675fcdce8c4SStefano Zampini } 2676fcdce8c4SStefano Zampini 2677fcdce8c4SStefano Zampini mmdata->flops = flops; 2678fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2679b4285af6SJunchao Zhang 2680fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2681fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2682fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2683fcdce8c4SStefano Zampini NULL, NULL, NULL, 2684fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2685fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2686fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2687b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2688b4285af6SJunchao Zhang { 2689b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2690b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2691b4285af6SJunchao Zhang */ 2692b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2693b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2694b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2695b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2696b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2697b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2698b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2699b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2700b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2701b4285af6SJunchao Zhang 2702b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2703b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2704b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2705b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2706b4285af6SJunchao Zhang &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2707b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2708b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2709b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2710b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2711b4285af6SJunchao Zhang &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2712b4285af6SJunchao Zhang 2713b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2714b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2715b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, 2716b4285af6SJunchao Zhang &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2717b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2718b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2719b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2720b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2721b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2722b4285af6SJunchao Zhang &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2723b4285af6SJunchao Zhang cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2724b4285af6SJunchao Zhang cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2725b4285af6SJunchao Zhang 2726b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2727b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 2728b4285af6SJunchao Zhang stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2729b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2730b4285af6SJunchao Zhang /* allocate matrix C */ 2731b4285af6SJunchao Zhang Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2732b4285af6SJunchao Zhang Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2733b4285af6SJunchao Zhang /* update matC with the new pointers */ 2734b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2735b4285af6SJunchao Zhang Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2736b4285af6SJunchao Zhang 2737b4285af6SJunchao Zhang 
/*----------------------------------------------------------------------*/ 2738b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2739b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2740b4285af6SJunchao Zhang &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2741b4285af6SJunchao Zhang cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2742b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2743b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2744b4285af6SJunchao Zhang &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2745b4285af6SJunchao Zhang cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2746b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2747b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2748b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2749b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2750*7d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2751b4285af6SJunchao Zhang } 2752ae37ee31SJunchao Zhang #else 2753b4285af6SJunchao Zhang size_t bufSize2; 2754fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2755b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2756fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2757fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2758fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, 
NULL);CHKERRCUSPARSE(stat); 2759bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2760fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2761b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2762fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2763fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2764fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2765fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2766b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2767fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2768fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2769fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2770fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2771fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2772fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2773fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2774fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2775bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2776fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2777b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2778fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2779fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2780fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2781fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2782fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2783fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 2784*7d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2785fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2786fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2787fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2788fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2789fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2790fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2791b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2792fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2793fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2794ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2795fcdce8c4SStefano Zampini #else 2796fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2797b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2798fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2799fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2800fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2801fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2802fcdce8c4SStefano Zampini c->nz = cnz; 2803fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2804fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2805fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2806fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2807fcdce8c4SStefano Zampini 2808fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2809fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2810fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2811fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2812b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2813fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2814fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2815fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2816fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2817fcdce8c4SStefano Zampini #endif 2818fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2819fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2820fcdce8c4SStefano Zampini finalizesym: 2821fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2822fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2823fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2824fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2825fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2826fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2827fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2828fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2829fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2830fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2831fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2832fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2833fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2834fcdce8c4SStefano Zampini cerr = 
cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2835fcdce8c4SStefano Zampini } else { 2836fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2837fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2838fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2839fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2840fcdce8c4SStefano Zampini } 2841fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2842fcdce8c4SStefano Zampini PetscInt r = 0; 2843fcdce8c4SStefano Zampini c->i[0] = 0; 2844fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2845fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2846fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2847fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2848fcdce8c4SStefano Zampini } 2849fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2850fcdce8c4SStefano Zampini } 2851fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2852fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2853fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2854fcdce8c4SStefano Zampini c->maxnz = c->nz; 2855fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2856fcdce8c4SStefano Zampini c->rmax = 0; 2857fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2858fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2859fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2860fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 
2861fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2862fcdce8c4SStefano Zampini } 2863fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2864fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2865fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2866fcdce8c4SStefano Zampini 2867fcdce8c4SStefano Zampini C->nonzerostate++; 2868fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2869fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2870fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2871fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2872fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2873fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2874fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2875abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2876fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2877fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2878fcdce8c4SStefano Zampini } 2879fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2880fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2881fcdce8c4SStefano Zampini } 2882fcdce8c4SStefano Zampini 2883fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2884fcdce8c4SStefano Zampini 2885fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2886fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2887fcdce8c4SStefano Zampini { 2888fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2889fcdce8c4SStefano Zampini PetscErrorCode ierr; 2890fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 
2891fcdce8c4SStefano Zampini 2892fcdce8c4SStefano Zampini PetscFunctionBegin; 2893fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2894fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2895abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 2896fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2897fcdce8c4SStefano Zampini } 2898fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2899fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2900fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2901fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2902fcdce8c4SStefano Zampini } 2903fcdce8c4SStefano Zampini } 290465e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 290565e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 290665e4b4d4SStefano Zampini switch (product->type) { 290765e4b4d4SStefano Zampini case MATPRODUCT_AB: 290865e4b4d4SStefano Zampini if (product->api_user) { 290965e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 291065e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 291165e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 291265e4b4d4SStefano Zampini } else { 291365e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 29143e662e0bSHong Zhang ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 291565e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 291665e4b4d4SStefano Zampini } 
291765e4b4d4SStefano Zampini break; 291865e4b4d4SStefano Zampini case MATPRODUCT_AtB: 291965e4b4d4SStefano Zampini if (product->api_user) { 292065e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 292165e4b4d4SStefano Zampini ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 292265e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 292365e4b4d4SStefano Zampini } else { 292465e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 29253e662e0bSHong Zhang ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 292665e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 292765e4b4d4SStefano Zampini } 292865e4b4d4SStefano Zampini break; 292965e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 293065e4b4d4SStefano Zampini if (product->api_user) { 293165e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 293265e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 293365e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 293465e4b4d4SStefano Zampini } else { 293565e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 29363e662e0bSHong Zhang ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 293765e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 293865e4b4d4SStefano Zampini } 293965e4b4d4SStefano Zampini break; 294065e4b4d4SStefano Zampini case MATPRODUCT_RARt: 
294165e4b4d4SStefano Zampini if (product->api_user) { 294265e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 294365e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 294465e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 294565e4b4d4SStefano Zampini } else { 294665e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 29473e662e0bSHong Zhang ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 294865e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 294965e4b4d4SStefano Zampini } 295065e4b4d4SStefano Zampini break; 295165e4b4d4SStefano Zampini case MATPRODUCT_ABC: 295265e4b4d4SStefano Zampini if (product->api_user) { 295365e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 295465e4b4d4SStefano Zampini ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 295565e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 295665e4b4d4SStefano Zampini } else { 295765e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 29583e662e0bSHong Zhang ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 295965e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 296065e4b4d4SStefano Zampini } 296165e4b4d4SStefano Zampini break; 296265e4b4d4SStefano Zampini default: 296365e4b4d4SStefano Zampini break; 296465e4b4d4SStefano Zampini } 296565e4b4d4SStefano Zampini if (usecpu) Biscusp = 
Ciscusp = PETSC_FALSE; 296665e4b4d4SStefano Zampini } 296765e4b4d4SStefano Zampini /* dispatch */ 2968fcdce8c4SStefano Zampini if (isdense) { 2969ccdfe979SStefano Zampini switch (product->type) { 2970ccdfe979SStefano Zampini case MATPRODUCT_AB: 2971ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2972ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2973ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2974ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2975fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2976fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2977fcdce8c4SStefano Zampini } else { 2978fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2979fcdce8c4SStefano Zampini } 2980fcdce8c4SStefano Zampini break; 2981fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2982fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2983fcdce8c4SStefano Zampini break; 2984ccdfe979SStefano Zampini default: 2985ccdfe979SStefano Zampini break; 2986ccdfe979SStefano Zampini } 2987fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2988fcdce8c4SStefano Zampini switch (product->type) { 2989fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2990fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2991fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2992fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2993fcdce8c4SStefano Zampini break; 2994fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2995fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2996fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2997fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2998fcdce8c4SStefano Zampini break; 2999fcdce8c4SStefano Zampini default: 3000fcdce8c4SStefano Zampini break; 3001fcdce8c4SStefano Zampini } 3002fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 3003fcdce8c4SStefano Zampini ierr = 
MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 3004fcdce8c4SStefano Zampini } 3005ccdfe979SStefano Zampini PetscFunctionReturn(0); 3006ccdfe979SStefano Zampini } 3007ccdfe979SStefano Zampini 30086fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 30099ae82921SPaul Mullowney { 3010b175d8bbSPaul Mullowney PetscErrorCode ierr; 30119ae82921SPaul Mullowney 30129ae82921SPaul Mullowney PetscFunctionBegin; 3013e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3014e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3015e6e9a74fSStefano Zampini } 3016e6e9a74fSStefano Zampini 3017e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3018e6e9a74fSStefano Zampini { 3019e6e9a74fSStefano Zampini PetscErrorCode ierr; 3020e6e9a74fSStefano Zampini 3021e6e9a74fSStefano Zampini PetscFunctionBegin; 3022e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3023e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3024e6e9a74fSStefano Zampini } 3025e6e9a74fSStefano Zampini 3026e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3027e6e9a74fSStefano Zampini { 3028e6e9a74fSStefano Zampini PetscErrorCode ierr; 3029e6e9a74fSStefano Zampini 3030e6e9a74fSStefano Zampini PetscFunctionBegin; 3031e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3032e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3033e6e9a74fSStefano Zampini } 3034e6e9a74fSStefano Zampini 3035e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3036e6e9a74fSStefano Zampini { 3037e6e9a74fSStefano Zampini PetscErrorCode ierr; 3038e6e9a74fSStefano Zampini 3039e6e9a74fSStefano Zampini PetscFunctionBegin; 3040e6e9a74fSStefano Zampini ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 30419ae82921SPaul Mullowney PetscFunctionReturn(0); 30429ae82921SPaul Mullowney } 30439ae82921SPaul Mullowney 30446fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3045ca45077fSPaul Mullowney { 3046b175d8bbSPaul Mullowney PetscErrorCode ierr; 3047ca45077fSPaul Mullowney 3048ca45077fSPaul Mullowney PetscFunctionBegin; 3049e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3050ca45077fSPaul Mullowney PetscFunctionReturn(0); 3051ca45077fSPaul Mullowney } 3052ca45077fSPaul Mullowney 3053a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3054a0e72f99SJunchao Zhang { 3055a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 3056a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3057a0e72f99SJunchao Zhang } 3058a0e72f99SJunchao Zhang 3059afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3060e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 30619ae82921SPaul Mullowney { 30629ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3063aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 30649ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3065e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3066b175d8bbSPaul Mullowney PetscErrorCode ierr; 3067aa372e3fSPaul Mullowney cusparseStatus_t stat; 3068e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3069e6e9a74fSStefano Zampini PetscBool compressed; 3070afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3071afb2bd1cSJunchao Zhang PetscInt nx,ny; 3072afb2bd1cSJunchao Zhang #endif 30736e111a19SKarl Rupp 30749ae82921SPaul Mullowney PetscFunctionBegin; 3075e8d2b73aSMark Adams if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3076e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 3077afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 3078d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 3079e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3080e6e9a74fSStefano Zampini } 308134d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 308234d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3083e6e9a74fSStefano Zampini if (!trans) { 30849ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3085e8d2b73aSMark Adams if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3086e6e9a74fSStefano Zampini } else { 30871a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3088e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3089e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3090e6e9a74fSStefano Zampini } else { 30913606e59fSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3092e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3093e6e9a74fSStefano Zampini } 3094e6e9a74fSStefano Zampini } 3095e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3096e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3097213423ffSJunchao Zhang 3098e6e9a74fSStefano Zampini try { 3099e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3100213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3101213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3102afb2bd1cSJunchao Zhang 310385ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3104e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3105afb2bd1cSJunchao Zhang /* z = A x + beta y. 3106afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
3107afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3108afb2bd1cSJunchao Zhang */ 3109e6e9a74fSStefano Zampini xptr = xarray; 3110afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3111213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3112afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3113afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3114afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3115afb2bd1cSJunchao Zhang */ 3116afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3117afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3118afb2bd1cSJunchao Zhang nx = mat->num_cols; 3119afb2bd1cSJunchao Zhang ny = mat->num_rows; 3120afb2bd1cSJunchao Zhang } 3121afb2bd1cSJunchao Zhang #endif 3122e6e9a74fSStefano Zampini } else { 3123afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3124afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3125afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3126afb2bd1cSJunchao Zhang */ 3127afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3128e6e9a74fSStefano Zampini dptr = zarray; 3129e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3130afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3131e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3132a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3133e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3134e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3135e6e9a74fSStefano Zampini } 3136afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3137afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3138afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3139afb2bd1cSJunchao Zhang nx = mat->num_rows; 3140afb2bd1cSJunchao Zhang ny = mat->num_cols; 3141afb2bd1cSJunchao Zhang } 3142afb2bd1cSJunchao Zhang #endif 3143e6e9a74fSStefano Zampini } 31449ae82921SPaul Mullowney 3145afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3146aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3147afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3148afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3149afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3150ee7b52eaSHong Zhang cudaError_t cerr; 3151afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3152afb2bd1cSJunchao Zhang stat = 
cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3153afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3154afb2bd1cSJunchao Zhang matstruct->matDescr, 3155afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3156afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3157afb2bd1cSJunchao Zhang cusparse_scalartype, 3158afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3159afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3160afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3161afb2bd1cSJunchao Zhang 3162afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3163afb2bd1cSJunchao Zhang } else { 3164afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3165afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3166afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3167afb2bd1cSJunchao Zhang } 3168afb2bd1cSJunchao Zhang 3169afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 3170afb2bd1cSJunchao Zhang matstruct->alpha_one, 31713606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3172afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3173afb2bd1cSJunchao Zhang beta, 3174afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3175afb2bd1cSJunchao Zhang cusparse_scalartype, 3176afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 3177afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3178afb2bd1cSJunchao Zhang #else 31797656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 
3180e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3181a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3182afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3183aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3184e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 318557d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3186afb2bd1cSJunchao Zhang #endif 3187aa372e3fSPaul Mullowney } else { 3188213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3189afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3190afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3191afb2bd1cSJunchao Zhang #else 3192301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3193e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3194afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3195e6e9a74fSStefano Zampini xptr, beta, 319657d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 3197afb2bd1cSJunchao Zhang #endif 3198a65300a6SPaul Mullowney } 3199aa372e3fSPaul Mullowney } 3200958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3201aa372e3fSPaul Mullowney 3202e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3203213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3204213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3205213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3206e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */ 3207213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 32087656d835SStefano Zampini } 3209213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3210c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 32117656d835SStefano Zampini } 32127656d835SStefano Zampini 3213213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3214213423ffSJunchao Zhang if (compressed) { 3215e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3216a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3217a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3218a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
3219a0e72f99SJunchao Zhang */ 3220a0e72f99SJunchao Zhang #if 0 3221a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3222a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3223a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3224e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3225c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3226a0e72f99SJunchao Zhang #else 3227a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3228a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3229a0e72f99SJunchao Zhang #endif 3230958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3231e6e9a74fSStefano Zampini } 3232e6e9a74fSStefano Zampini } else { 3233e6e9a74fSStefano Zampini if (yy && yy != zz) { 3234e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3235e6e9a74fSStefano Zampini } 3236e6e9a74fSStefano Zampini } 3237e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3238213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3239213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 32409ae82921SPaul Mullowney } catch(char *ex) { 324198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 32429ae82921SPaul Mullowney } 3243e6e9a74fSStefano Zampini if (yy) { 3244958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3245e6e9a74fSStefano 
Zampini } else { 3246e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3247e6e9a74fSStefano Zampini } 32489ae82921SPaul Mullowney PetscFunctionReturn(0); 32499ae82921SPaul Mullowney } 32509ae82921SPaul Mullowney 32516fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3252ca45077fSPaul Mullowney { 3253b175d8bbSPaul Mullowney PetscErrorCode ierr; 32546e111a19SKarl Rupp 3255ca45077fSPaul Mullowney PetscFunctionBegin; 3256e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3257ca45077fSPaul Mullowney PetscFunctionReturn(0); 3258ca45077fSPaul Mullowney } 3259ca45077fSPaul Mullowney 32606fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 32619ae82921SPaul Mullowney { 32629ae82921SPaul Mullowney PetscErrorCode ierr; 3263042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3264042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 32653fa6b06aSMark Adams 3266042217e8SBarry Smith PetscFunctionBegin; 3267042217e8SBarry Smith ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3268042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3269042217e8SBarry Smith cudaError_t cerr; 3270042217e8SBarry Smith 3271042217e8SBarry Smith ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3272042217e8SBarry Smith cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3273042217e8SBarry Smith cusp->deviceMat = NULL; 3274042217e8SBarry Smith } 32759ae82921SPaul Mullowney PetscFunctionReturn(0); 32769ae82921SPaul Mullowney } 32779ae82921SPaul Mullowney 32789ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3279e057df02SPaul Mullowney /*@ 32809ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 
(the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* m and n are passed as both the local and the global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the SeqAIJ preallocation routine takes a non-const array, hence the cast on nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 33589ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 33599ae82921SPaul Mullowney PetscFunctionReturn(0); 33609ae82921SPaul Mullowney } 33619ae82921SPaul Mullowney 3362ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 336395639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 33649ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 33659ff858a8SKarl Rupp { 33669ff858a8SKarl Rupp PetscErrorCode ierr; 33679ff858a8SKarl Rupp 33689ff858a8SKarl Rupp PetscFunctionBegin; 33699ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3370ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 33719ff858a8SKarl Rupp PetscFunctionReturn(0); 33729ff858a8SKarl Rupp } 33739ff858a8SKarl Rupp 3374039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 337595639643SRichard Tran Mills { 3376e6e9a74fSStefano Zampini PetscErrorCode ierr; 3377a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3378039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3379039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3380039c6fbaSStefano Zampini PetscScalar *ay; 3381039c6fbaSStefano Zampini const PetscScalar *ax; 3382039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3383e6e9a74fSStefano Zampini 338495639643SRichard Tran Mills PetscFunctionBegin; 3385a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3386a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3387039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3388a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3389a587d139SMark ierr = 
MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3390a587d139SMark PetscFunctionReturn(0); 339195639643SRichard Tran Mills } 3392039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 3393a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3394a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3395e8d2b73aSMark Adams if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3396e8d2b73aSMark Adams if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3397039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3398039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3399039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3400039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3401039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3402039c6fbaSStefano Zampini if (eq) { 3403039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3404039c6fbaSStefano Zampini } 3405039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3406039c6fbaSStefano Zampini } 3407d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3408d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3409039c6fbaSStefano Zampini 3410039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3411039c6fbaSStefano Zampini cusparseStatus_t stat; 3412039c6fbaSStefano Zampini PetscScalar b = 1.0; 3413039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3414039c6fbaSStefano Zampini size_t bufferSize; 3415039c6fbaSStefano Zampini void *buffer; 
3416ee7b52eaSHong Zhang cudaError_t cerr; 3417039c6fbaSStefano Zampini #endif 3418039c6fbaSStefano Zampini 3419039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3420039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3421039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3422039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3423039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3424039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3425039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3426039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3427039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3428039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3429039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3430039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3431039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3432039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3433039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3434039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3435039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3436039c6fbaSStefano Zampini #else 3437039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3438039c6fbaSStefano Zampini stat = 
cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3439039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3440039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3441039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3442039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3443039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3444039c6fbaSStefano Zampini #endif 3445039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3446039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3447039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3448039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3449039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3450a587d139SMark cublasHandle_t cublasv2handle; 3451039c6fbaSStefano Zampini cublasStatus_t berr; 3452a587d139SMark PetscBLASInt one = 1, bnz = 1; 3453039c6fbaSStefano Zampini 3454039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3455039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3456a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3457a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3458a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3459039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3460a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3461a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3462039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3463039c6fbaSStefano Zampini 
ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3464a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3465039c6fbaSStefano Zampini } else { 3466a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3467d2be01edSStefano Zampini ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3468a587d139SMark } 346995639643SRichard Tran Mills PetscFunctionReturn(0); 347095639643SRichard Tran Mills } 347195639643SRichard Tran Mills 347233c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 347333c9ba73SStefano Zampini { 347433c9ba73SStefano Zampini PetscErrorCode ierr; 347533c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 347633c9ba73SStefano Zampini PetscScalar *ay; 347733c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 347833c9ba73SStefano Zampini cublasStatus_t berr; 347933c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 348033c9ba73SStefano Zampini 348133c9ba73SStefano Zampini PetscFunctionBegin; 348233c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 348333c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 348433c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 348533c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 348633c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 348733c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 348833c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 348933c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 349033c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 349133c9ba73SStefano Zampini PetscFunctionReturn(0); 349233c9ba73SStefano Zampini } 349333c9ba73SStefano Zampini 34943fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 34953fa6b06aSMark Adams { 
34963fa6b06aSMark Adams PetscErrorCode ierr; 34977e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3498a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 34997e8381f9SStefano Zampini 35003fa6b06aSMark Adams PetscFunctionBegin; 35013fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 35023fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 35037e8381f9SStefano Zampini if (spptr->mat) { 35047e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 35057e8381f9SStefano Zampini if (matrix->values) { 35067e8381f9SStefano Zampini both = PETSC_TRUE; 35077e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 35087e8381f9SStefano Zampini } 35097e8381f9SStefano Zampini } 35107e8381f9SStefano Zampini if (spptr->matTranspose) { 35117e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 35127e8381f9SStefano Zampini if (matrix->values) { 35137e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 35147e8381f9SStefano Zampini } 35157e8381f9SStefano Zampini } 35163fa6b06aSMark Adams } 3517a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3518a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3519a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 35207e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3521a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 35223fa6b06aSMark Adams PetscFunctionReturn(0); 35233fa6b06aSMark Adams } 35243fa6b06aSMark Adams 3525a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3526a587d139SMark { 3527a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3528a587d139SMark PetscErrorCode ierr; 3529a587d139SMark 3530a587d139SMark PetscFunctionBegin; 35319a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 35329a14fc28SStefano Zampini A->boundtocpu = flg; 
35339a14fc28SStefano Zampini PetscFunctionReturn(0); 35349a14fc28SStefano Zampini } 3535a587d139SMark if (flg) { 3536a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3537a587d139SMark 353833c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3539a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3540a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3541a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3542a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3543a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3544a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3545a587d139SMark A->ops->multhermitiantranspose = NULL; 3546a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3547fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 354867a45760SJunchao Zhang ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3549c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3550a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3551a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3552a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3553a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3554a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3555fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3556a587d139SMark } else { 355733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3558a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 
3559a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3560a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3561a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3562a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3563a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3564a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3565a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3566fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 356767a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 356867a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 356967a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 357067a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 357167a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 357267a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3573c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3574a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3577a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578fcdce8c4SStefano 
Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579a587d139SMark } 3580a587d139SMark A->boundtocpu = flg; 3581ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3582ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3583ea500dcfSRichard Tran Mills } else { 3584ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3585ea500dcfSRichard Tran Mills } 3586a587d139SMark PetscFunctionReturn(0); 3587a587d139SMark } 3588a587d139SMark 358949735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 35909ae82921SPaul Mullowney { 35919ae82921SPaul Mullowney PetscErrorCode ierr; 3592aa372e3fSPaul Mullowney cusparseStatus_t stat; 359349735bf3SStefano Zampini Mat B; 35949ae82921SPaul Mullowney 35959ae82921SPaul Mullowney PetscFunctionBegin; 3596a4af0ceeSJacob Faibussowitsch ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 359749735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 359849735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 359949735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 360049735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 360149735bf3SStefano Zampini } 360249735bf3SStefano Zampini B = *newmat; 360349735bf3SStefano Zampini 360434136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 360534136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 360634136279SStefano Zampini 360749735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 36089ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3609e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3610e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 
3611e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3612a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 36131a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3614d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3615a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3616a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3617a435da06SStefano Zampini #else 3618d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3619a435da06SStefano Zampini #endif 3620d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3621d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3622d8132acaSStefano Zampini #endif 36231a2c6b5cSJunchao Zhang B->spptr = spptr; 36249ae82921SPaul Mullowney } else { 3625e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3626e6e9a74fSStefano Zampini 3627e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3628e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3629a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3630e6e9a74fSStefano Zampini B->spptr = spptr; 36319ae82921SPaul Mullowney } 3632e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 363349735bf3SStefano Zampini } 3634693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 36359ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 36361a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 36379ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 363895639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3639693b0035SStefano Zampini B->ops->duplicate = 
MatDuplicate_SeqAIJCUSPARSE; 36402205254eSKarl Rupp 3641e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 36429ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3643bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3644ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 3645ae48a8d0SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3646ae48a8d0SStefano Zampini #endif 3647365b711fSMark Adams ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 36489ae82921SPaul Mullowney PetscFunctionReturn(0); 36499ae82921SPaul Mullowney } 36509ae82921SPaul Mullowney 365102fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 365202fe1965SBarry Smith { 365302fe1965SBarry Smith PetscErrorCode ierr; 365402fe1965SBarry Smith 365502fe1965SBarry Smith PetscFunctionBegin; 365602fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 36570ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 365802fe1965SBarry Smith PetscFunctionReturn(0); 365902fe1965SBarry Smith } 366002fe1965SBarry Smith 36613ca39a21SBarry Smith /*MC 3662e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3663e057df02SPaul Mullowney 3664e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 36652692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 36662692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 
3667e057df02SPaul Mullowney 3668e057df02SPaul Mullowney Options Database Keys: 3669e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3670aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3671a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3672365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3673e057df02SPaul Mullowney 3674e057df02SPaul Mullowney Level: beginner 3675e057df02SPaul Mullowney 36768468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3677e057df02SPaul Mullowney M*/ 36787f756511SDominic Meiser 3679bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 36800f39cd5aSBarry Smith 36813ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 368242c9c57cSBarry Smith { 368342c9c57cSBarry Smith PetscErrorCode ierr; 368442c9c57cSBarry Smith 368542c9c57cSBarry Smith PetscFunctionBegin; 3686bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 36873ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 36883ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 36893ca39a21SBarry Smith ierr = 
MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 36903ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3691bddcd29dSMark Adams 369242c9c57cSBarry Smith PetscFunctionReturn(0); 369342c9c57cSBarry Smith } 369429b38603SBarry Smith 3695470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 36967f756511SDominic Meiser { 3697e6e9a74fSStefano Zampini PetscErrorCode ierr; 36987f756511SDominic Meiser cusparseStatus_t stat; 36997f756511SDominic Meiser 37007f756511SDominic Meiser PetscFunctionBegin; 37017f756511SDominic Meiser if (*cusparsestruct) { 3702e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3703e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 37047f756511SDominic Meiser delete (*cusparsestruct)->workVector; 370581902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 37067e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 37077e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3708a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 37097e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3710e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 37117f756511SDominic Meiser } 37127f756511SDominic Meiser PetscFunctionReturn(0); 37137f756511SDominic Meiser } 37147f756511SDominic Meiser 37157f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 37167f756511SDominic Meiser { 37177f756511SDominic Meiser PetscFunctionBegin; 37187f756511SDominic Meiser if (*mat) { 37197f756511SDominic Meiser delete 
(*mat)->values; 37207f756511SDominic Meiser delete (*mat)->column_indices; 37217f756511SDominic Meiser delete (*mat)->row_offsets; 37227f756511SDominic Meiser delete *mat; 37237f756511SDominic Meiser *mat = 0; 37247f756511SDominic Meiser } 37257f756511SDominic Meiser PetscFunctionReturn(0); 37267f756511SDominic Meiser } 37277f756511SDominic Meiser 3728470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 37297f756511SDominic Meiser { 37307f756511SDominic Meiser cusparseStatus_t stat; 37317f756511SDominic Meiser PetscErrorCode ierr; 37327f756511SDominic Meiser 37337f756511SDominic Meiser PetscFunctionBegin; 37347f756511SDominic Meiser if (*trifactor) { 373557d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3736afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 37377f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 37381b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 37392cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3740afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 37411b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3742afb2bd1cSJunchao Zhang #endif 3743da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 37447f756511SDominic Meiser } 37457f756511SDominic Meiser PetscFunctionReturn(0); 37467f756511SDominic Meiser } 37477f756511SDominic Meiser 3748470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 37497f756511SDominic Meiser { 
37507f756511SDominic Meiser CsrMatrix *mat; 37517f756511SDominic Meiser cusparseStatus_t stat; 37527f756511SDominic Meiser cudaError_t err; 37537f756511SDominic Meiser 37547f756511SDominic Meiser PetscFunctionBegin; 37557f756511SDominic Meiser if (*matstruct) { 37567f756511SDominic Meiser if ((*matstruct)->mat) { 37577f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3758afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3759afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3760afb2bd1cSJunchao Zhang #else 37617f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 376257d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3763afb2bd1cSJunchao Zhang #endif 37647f756511SDominic Meiser } else { 37657f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 37667f756511SDominic Meiser CsrMatrix_Destroy(&mat); 37677f756511SDominic Meiser } 37687f756511SDominic Meiser } 376957d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 37707f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3771afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 37727656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 37737656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3774afb2bd1cSJunchao Zhang 3775afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3776afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3777afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3778afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3779afb2bd1cSJunchao Zhang if 
(mdata->cuSpMV[i].initialized) { 3780afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3781afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3782afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3783afb2bd1cSJunchao Zhang } 3784afb2bd1cSJunchao Zhang } 3785afb2bd1cSJunchao Zhang #endif 37867f756511SDominic Meiser delete *matstruct; 37877e8381f9SStefano Zampini *matstruct = NULL; 37887f756511SDominic Meiser } 37897f756511SDominic Meiser PetscFunctionReturn(0); 37907f756511SDominic Meiser } 37917f756511SDominic Meiser 3792e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 37937f756511SDominic Meiser { 3794e6e9a74fSStefano Zampini PetscErrorCode ierr; 3795e6e9a74fSStefano Zampini 37967f756511SDominic Meiser PetscFunctionBegin; 37977f756511SDominic Meiser if (*trifactors) { 3798e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3799e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3800e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3801e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 38027f756511SDominic Meiser delete (*trifactors)->rpermIndices; 38037f756511SDominic Meiser delete (*trifactors)->cpermIndices; 38047f756511SDominic Meiser delete (*trifactors)->workVector; 38057e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 38067e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 38077e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 3808bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3809bddcd29dSMark 
Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3810e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3811ccdfe979SStefano Zampini } 3812ccdfe979SStefano Zampini PetscFunctionReturn(0); 3813ccdfe979SStefano Zampini } 3814ccdfe979SStefano Zampini 3815ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3816ccdfe979SStefano Zampini { 3817e6e9a74fSStefano Zampini PetscErrorCode ierr; 3818ccdfe979SStefano Zampini cusparseHandle_t handle; 3819ccdfe979SStefano Zampini cusparseStatus_t stat; 3820ccdfe979SStefano Zampini 3821ccdfe979SStefano Zampini PetscFunctionBegin; 3822ccdfe979SStefano Zampini if (*trifactors) { 3823e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 38247f756511SDominic Meiser if (handle = (*trifactors)->handle) { 382557d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 38267f756511SDominic Meiser } 3827e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 38287f756511SDominic Meiser } 38297f756511SDominic Meiser PetscFunctionReturn(0); 38307f756511SDominic Meiser } 38317e8381f9SStefano Zampini 38327e8381f9SStefano Zampini struct IJCompare 38337e8381f9SStefano Zampini { 38347e8381f9SStefano Zampini __host__ __device__ 38357e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 38367e8381f9SStefano Zampini { 38377e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 38387e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 38397e8381f9SStefano Zampini return false; 38407e8381f9SStefano Zampini } 38417e8381f9SStefano Zampini }; 38427e8381f9SStefano Zampini 38437e8381f9SStefano Zampini struct IJEqual 38447e8381f9SStefano Zampini { 38457e8381f9SStefano Zampini __host__ __device__ 38467e8381f9SStefano 
Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 38477e8381f9SStefano Zampini { 38487e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 38497e8381f9SStefano Zampini return true; 38507e8381f9SStefano Zampini } 38517e8381f9SStefano Zampini }; 38527e8381f9SStefano Zampini 38537e8381f9SStefano Zampini struct IJDiff 38547e8381f9SStefano Zampini { 38557e8381f9SStefano Zampini __host__ __device__ 38567e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 38577e8381f9SStefano Zampini { 38587e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 38597e8381f9SStefano Zampini } 38607e8381f9SStefano Zampini }; 38617e8381f9SStefano Zampini 38627e8381f9SStefano Zampini struct IJSum 38637e8381f9SStefano Zampini { 38647e8381f9SStefano Zampini __host__ __device__ 38657e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 38667e8381f9SStefano Zampini { 38677e8381f9SStefano Zampini return t1||t2; 38687e8381f9SStefano Zampini } 38697e8381f9SStefano Zampini }; 38707e8381f9SStefano Zampini 38717e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3872e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 38737e8381f9SStefano Zampini { 38747e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3875fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3876bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 387708391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 38787e8381f9SStefano Zampini CsrMatrix *matrix; 38797e8381f9SStefano Zampini PetscErrorCode ierr; 38807e8381f9SStefano Zampini PetscInt n; 38817e8381f9SStefano Zampini 38827e8381f9SStefano Zampini PetscFunctionBegin; 38837e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
CUSPARSE struct"); 38847e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 38857e8381f9SStefano Zampini if (!cusp->cooPerm) { 38867e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 38877e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 38887e8381f9SStefano Zampini PetscFunctionReturn(0); 38897e8381f9SStefano Zampini } 38907e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 38917e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3892e61fc153SStefano Zampini if (!v) { 3893e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3894e61fc153SStefano Zampini goto finalize; 38957e8381f9SStefano Zampini } 3896e61fc153SStefano Zampini n = cusp->cooPerm->size(); 389708391a17SStefano Zampini if (isCudaMem(v)) { 389808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 389908391a17SStefano Zampini } else { 3900e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3901e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 390208391a17SStefano Zampini d_v = cooPerm_v->data(); 3903e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 390408391a17SStefano Zampini } 3905bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3906e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3907ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3908bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 390908391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3910ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 
3911ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3912ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3913ddea5d60SJunchao Zhang */ 3914e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3915e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3916e61fc153SStefano Zampini delete cooPerm_w; 39177e8381f9SStefano Zampini } else { 3918ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 391908391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 39207e8381f9SStefano Zampini matrix->values->begin())); 392108391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 39227e8381f9SStefano Zampini matrix->values->end())); 3923ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 39247e8381f9SStefano Zampini } 39257e8381f9SStefano Zampini } else { 3926e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 392708391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3928e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 39297e8381f9SStefano Zampini } else { 393008391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 39317e8381f9SStefano 
Zampini matrix->values->begin())); 393208391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 39337e8381f9SStefano Zampini matrix->values->end())); 39347e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 39357e8381f9SStefano Zampini } 39367e8381f9SStefano Zampini } 3937bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3938e61fc153SStefano Zampini finalize: 3939e61fc153SStefano Zampini delete cooPerm_v; 39407e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3941e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3942fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3943*7d3de750SJacob Faibussowitsch ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3944fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3945*7d3de750SJacob Faibussowitsch ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr); 3946fcdce8c4SStefano Zampini a->reallocs = 0; 3947fcdce8c4SStefano Zampini A->info.mallocs += 0; 3948fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3949fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3950fcdce8c4SStefano Zampini A->num_ass++; 39517e8381f9SStefano Zampini PetscFunctionReturn(0); 39527e8381f9SStefano Zampini } 39537e8381f9SStefano Zampini 3954a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3955a49f1ed0SStefano Zampini { 3956a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3957a49f1ed0SStefano Zampini PetscErrorCode ierr; 3958a49f1ed0SStefano Zampini 3959a49f1ed0SStefano Zampini PetscFunctionBegin; 3960a49f1ed0SStefano Zampini 
PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3961a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3962a49f1ed0SStefano Zampini if (destroy) { 3963a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3964a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3965a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3966a49f1ed0SStefano Zampini } 39671a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3968a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3969a49f1ed0SStefano Zampini } 3970a49f1ed0SStefano Zampini 39717e8381f9SStefano Zampini #include <thrust/binary_search.h> 397282a78a4eSJed Brown PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 39737e8381f9SStefano Zampini { 39747e8381f9SStefano Zampini PetscErrorCode ierr; 39757e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 39767e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 39777e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 39787e8381f9SStefano Zampini cudaError_t cerr; 39797e8381f9SStefano Zampini 39807e8381f9SStefano Zampini PetscFunctionBegin; 39817e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 39827e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 39837e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 39847e8381f9SStefano Zampini if (n != cooPerm_n) { 39857e8381f9SStefano Zampini delete cusp->cooPerm; 39867e8381f9SStefano Zampini delete cusp->cooPerm_a; 39877e8381f9SStefano Zampini cusp->cooPerm = NULL; 39887e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 39897e8381f9SStefano Zampini } 39907e8381f9SStefano Zampini if (n) { 39917e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 39927e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 39937e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 39947e8381f9SStefano Zampini 39957e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 39967e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 39977e8381f9SStefano Zampini 39987e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 39997e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 40007e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 4001ddea5d60SJunchao Zhang 4002ddea5d60SJunchao Zhang /* Ex. 
4003ddea5d60SJunchao Zhang n = 6 4004ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 4005ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 4006ddea5d60SJunchao Zhang */ 40077e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 40087e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 40097e8381f9SStefano Zampini 401008391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 40117e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4012ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4013ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 40147e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 40157e8381f9SStefano Zampini 4016ddea5d60SJunchao Zhang /* 4017ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 4018ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 4019ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 4020ddea5d60SJunchao Zhang */ 4021ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4022ddea5d60SJunchao Zhang 4023ddea5d60SJunchao Zhang /* 4024ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 4025ddea5d60SJunchao Zhang ^ekey 4026ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 4027ddea5d60SJunchao Zhang ^nekye 4028ddea5d60SJunchao Zhang */ 40297e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 40307e8381f9SStefano Zampini delete cusp->cooPerm_a; 40317e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 4032ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4033ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4034ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 4035ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4036ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 40377e8381f9SStefano Zampini w[0] = 0; 4038ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4039ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 40407e8381f9SStefano Zampini } 40417e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 4042ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4043ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4044ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 404508391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 40467e8381f9SStefano Zampini 40477e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 40487e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 40497e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 40507e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 40517e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4052ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 40537e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 40547e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 4055fcdce8c4SStefano Zampini a->rmax = 0; 40567e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 40577e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 40587e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 40597e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 40607e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 40617e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 40627e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 40637e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 40647e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 4065fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 40667e8381f9SStefano Zampini } 4067fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 40687e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 40697e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4070fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 40717e8381f9SStefano Zampini } else { 40727e8381f9SStefano 
Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 40737e8381f9SStefano Zampini } 4074e61fc153SStefano Zampini ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 40757e8381f9SStefano Zampini 40767e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4077e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 4078e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 40797e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 40807e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 40817e8381f9SStefano Zampini A->nonzerostate++; 40827e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4083a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 40847e8381f9SStefano Zampini 40857e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 40867e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 40877e8381f9SStefano Zampini PetscFunctionReturn(0); 40887e8381f9SStefano Zampini } 4089ed502f03SStefano Zampini 40905b7e41feSStefano Zampini /*@C 40915b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
40925b7e41feSStefano Zampini 40935b7e41feSStefano Zampini Not collective 40945b7e41feSStefano Zampini 40955b7e41feSStefano Zampini Input Parameters: 40965b7e41feSStefano Zampini + A - the matrix 40975b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40985b7e41feSStefano Zampini 40995b7e41feSStefano Zampini Output Parameters: 41005b7e41feSStefano Zampini + ia - the CSR row pointers 41015b7e41feSStefano Zampini - ja - the CSR column indices 41025b7e41feSStefano Zampini 41035b7e41feSStefano Zampini Level: developer 41045b7e41feSStefano Zampini 41055b7e41feSStefano Zampini Notes: 41065b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 41075b7e41feSStefano Zampini 41085b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 41095b7e41feSStefano Zampini @*/ 41105f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 41115f101d05SStefano Zampini { 41125f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 41135f101d05SStefano Zampini CsrMatrix *csr; 41145f101d05SStefano Zampini PetscErrorCode ierr; 41155f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41165f101d05SStefano Zampini 41175f101d05SStefano Zampini PetscFunctionBegin; 41185f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41195f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 41205f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41215f101d05SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41225f101d05SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 41235f101d05SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 41245f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 41255f101d05SStefano Zampini if (i) { 41265f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 41275f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 41285f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 41295f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 41305f101d05SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 41315f101d05SStefano Zampini } 41325f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 41335f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 41345f101d05SStefano Zampini } 41355f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 41365f101d05SStefano Zampini PetscFunctionReturn(0); 41375f101d05SStefano Zampini } 41385f101d05SStefano Zampini 41395b7e41feSStefano Zampini /*@C 41405b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 41415b7e41feSStefano Zampini 41425b7e41feSStefano Zampini Not collective 41435b7e41feSStefano Zampini 41445b7e41feSStefano Zampini Input Parameters: 41455b7e41feSStefano Zampini + A - the matrix 41465b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 41475b7e41feSStefano Zampini 41485b7e41feSStefano Zampini Output Parameters: 41495b7e41feSStefano Zampini + ia - the CSR row pointers 41505b7e41feSStefano Zampini - ja - the CSR column indices 41515b7e41feSStefano Zampini 41525b7e41feSStefano Zampini Level: developer 41535b7e41feSStefano Zampini 41545b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetIJ() 41555b7e41feSStefano Zampini @*/ 41565f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, 
const int **j) 41575f101d05SStefano Zampini { 41585f101d05SStefano Zampini PetscFunctionBegin; 41595f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41605f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41615f101d05SStefano Zampini if (i) *i = NULL; 41625f101d05SStefano Zampini if (j) *j = NULL; 41635f101d05SStefano Zampini PetscFunctionReturn(0); 41645f101d05SStefano Zampini } 41655f101d05SStefano Zampini 41665b7e41feSStefano Zampini /*@C 41675b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41685b7e41feSStefano Zampini 41695b7e41feSStefano Zampini Not Collective 41705b7e41feSStefano Zampini 41715b7e41feSStefano Zampini Input Parameter: 41725b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41735b7e41feSStefano Zampini 41745b7e41feSStefano Zampini Output Parameter: 41755b7e41feSStefano Zampini . a - pointer to the device data 41765b7e41feSStefano Zampini 41775b7e41feSStefano Zampini Level: developer 41785b7e41feSStefano Zampini 41795b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 41805b7e41feSStefano Zampini 41815b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 41825b7e41feSStefano Zampini @*/ 4183ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4184ed502f03SStefano Zampini { 4185ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4186ed502f03SStefano Zampini CsrMatrix *csr; 4187ed502f03SStefano Zampini PetscErrorCode ierr; 4188ed502f03SStefano Zampini 4189ed502f03SStefano Zampini PetscFunctionBegin; 4190ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4191ed502f03SStefano Zampini PetscValidPointer(a,2); 4192ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 
4193ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4194ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 419533c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4196ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4197ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4198ed502f03SStefano Zampini *a = csr->values->data().get(); 4199ed502f03SStefano Zampini PetscFunctionReturn(0); 4200ed502f03SStefano Zampini } 4201ed502f03SStefano Zampini 42025b7e41feSStefano Zampini /*@C 42035b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 42045b7e41feSStefano Zampini 42055b7e41feSStefano Zampini Not Collective 42065b7e41feSStefano Zampini 42075b7e41feSStefano Zampini Input Parameter: 42085b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42095b7e41feSStefano Zampini 42105b7e41feSStefano Zampini Output Parameter: 42115b7e41feSStefano Zampini . 
a - pointer to the device data 42125b7e41feSStefano Zampini 42135b7e41feSStefano Zampini Level: developer 42145b7e41feSStefano Zampini 42155b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead() 42165b7e41feSStefano Zampini @*/ 4217ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4218ed502f03SStefano Zampini { 4219ed502f03SStefano Zampini PetscFunctionBegin; 4220ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4221ed502f03SStefano Zampini PetscValidPointer(a,2); 4222ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4223ed502f03SStefano Zampini *a = NULL; 4224ed502f03SStefano Zampini PetscFunctionReturn(0); 4225ed502f03SStefano Zampini } 4226ed502f03SStefano Zampini 42275b7e41feSStefano Zampini /*@C 42285b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42295b7e41feSStefano Zampini 42305b7e41feSStefano Zampini Not Collective 42315b7e41feSStefano Zampini 42325b7e41feSStefano Zampini Input Parameter: 42335b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42345b7e41feSStefano Zampini 42355b7e41feSStefano Zampini Output Parameter: 42365b7e41feSStefano Zampini . 
a - pointer to the device data 42375b7e41feSStefano Zampini 42385b7e41feSStefano Zampini Level: developer 42395b7e41feSStefano Zampini 42405b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 42415b7e41feSStefano Zampini 42425b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 42435b7e41feSStefano Zampini @*/ 4244039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4245039c6fbaSStefano Zampini { 4246039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4247039c6fbaSStefano Zampini CsrMatrix *csr; 4248039c6fbaSStefano Zampini PetscErrorCode ierr; 4249039c6fbaSStefano Zampini 4250039c6fbaSStefano Zampini PetscFunctionBegin; 4251039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4252039c6fbaSStefano Zampini PetscValidPointer(a,2); 4253039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4254039c6fbaSStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4255039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 425633c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4257039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4258039c6fbaSStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4259039c6fbaSStefano Zampini *a = csr->values->data().get(); 4260039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4261a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4262039c6fbaSStefano Zampini PetscFunctionReturn(0); 4263039c6fbaSStefano Zampini } 42645b7e41feSStefano Zampini /*@C 42655b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access 
array obtained from MatSeqAIJCUSPARSEGetArray() 4266039c6fbaSStefano Zampini 42675b7e41feSStefano Zampini Not Collective 42685b7e41feSStefano Zampini 42695b7e41feSStefano Zampini Input Parameter: 42705b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42715b7e41feSStefano Zampini 42725b7e41feSStefano Zampini Output Parameter: 42735b7e41feSStefano Zampini . a - pointer to the device data 42745b7e41feSStefano Zampini 42755b7e41feSStefano Zampini Level: developer 42765b7e41feSStefano Zampini 42775b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray() 42785b7e41feSStefano Zampini @*/ 4279039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4280039c6fbaSStefano Zampini { 4281039c6fbaSStefano Zampini PetscErrorCode ierr; 4282039c6fbaSStefano Zampini 4283039c6fbaSStefano Zampini PetscFunctionBegin; 4284039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4285039c6fbaSStefano Zampini PetscValidPointer(a,2); 4286039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4287039c6fbaSStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4288039c6fbaSStefano Zampini *a = NULL; 4289039c6fbaSStefano Zampini PetscFunctionReturn(0); 4290039c6fbaSStefano Zampini } 4291039c6fbaSStefano Zampini 42925b7e41feSStefano Zampini /*@C 42935b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42945b7e41feSStefano Zampini 42955b7e41feSStefano Zampini Not Collective 42965b7e41feSStefano Zampini 42975b7e41feSStefano Zampini Input Parameter: 42985b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42995b7e41feSStefano Zampini 43005b7e41feSStefano Zampini Output Parameter: 43015b7e41feSStefano Zampini . 
a - pointer to the device data 43025b7e41feSStefano Zampini 43035b7e41feSStefano Zampini Level: developer 43045b7e41feSStefano Zampini 43055b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 43065b7e41feSStefano Zampini 43075b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 43085b7e41feSStefano Zampini @*/ 4309ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4310ed502f03SStefano Zampini { 4311ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4312ed502f03SStefano Zampini CsrMatrix *csr; 4313a49f1ed0SStefano Zampini PetscErrorCode ierr; 4314ed502f03SStefano Zampini 4315ed502f03SStefano Zampini PetscFunctionBegin; 4316ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4317ed502f03SStefano Zampini PetscValidPointer(a,2); 4318ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4319ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 432033c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4321ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4322ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4323ed502f03SStefano Zampini *a = csr->values->data().get(); 4324039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 4325a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4326ed502f03SStefano Zampini PetscFunctionReturn(0); 4327ed502f03SStefano Zampini } 4328ed502f03SStefano Zampini 43295b7e41feSStefano Zampini /*@C 43305b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from 
MatSeqAIJCUSPARSEGetArrayWrite() 43315b7e41feSStefano Zampini 43325b7e41feSStefano Zampini Not Collective 43335b7e41feSStefano Zampini 43345b7e41feSStefano Zampini Input Parameter: 43355b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43365b7e41feSStefano Zampini 43375b7e41feSStefano Zampini Output Parameter: 43385b7e41feSStefano Zampini . a - pointer to the device data 43395b7e41feSStefano Zampini 43405b7e41feSStefano Zampini Level: developer 43415b7e41feSStefano Zampini 43425b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 43435b7e41feSStefano Zampini @*/ 4344ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4345ed502f03SStefano Zampini { 4346ed502f03SStefano Zampini PetscErrorCode ierr; 4347ed502f03SStefano Zampini 4348ed502f03SStefano Zampini PetscFunctionBegin; 4349ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4350ed502f03SStefano Zampini PetscValidPointer(a,2); 4351ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4352ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4353ed502f03SStefano Zampini *a = NULL; 4354ed502f03SStefano Zampini PetscFunctionReturn(0); 4355ed502f03SStefano Zampini } 4356ed502f03SStefano Zampini 4357ed502f03SStefano Zampini struct IJCompare4 4358ed502f03SStefano Zampini { 4359ed502f03SStefano Zampini __host__ __device__ 43602ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4361ed502f03SStefano Zampini { 4362ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4363ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4364ed502f03SStefano Zampini return false; 4365ed502f03SStefano Zampini } 4366ed502f03SStefano Zampini }; 4367ed502f03SStefano Zampini 43688909a122SStefano Zampini struct Shift 43698909a122SStefano Zampini { 
4370ed502f03SStefano Zampini int _shift; 4371ed502f03SStefano Zampini 4372ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4373ed502f03SStefano Zampini __host__ __device__ 4374ed502f03SStefano Zampini inline int operator() (const int &c) 4375ed502f03SStefano Zampini { 4376ed502f03SStefano Zampini return c + _shift; 4377ed502f03SStefano Zampini } 4378ed502f03SStefano Zampini }; 4379ed502f03SStefano Zampini 4380ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4381ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4382ed502f03SStefano Zampini { 4383ed502f03SStefano Zampini PetscErrorCode ierr; 4384ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4385ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4386ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4387ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4388ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4389ed502f03SStefano Zampini cusparseStatus_t stat; 4390ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4391ed502f03SStefano Zampini cudaError_t cerr; 4392ed502f03SStefano Zampini 4393ed502f03SStefano Zampini PetscFunctionBegin; 4394ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4395ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4396ed502f03SStefano Zampini PetscValidPointer(C,4); 4397ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4398ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 439998921bdaSJacob Faibussowitsch if (A->rmap->n != B->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4400ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4401ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4402ed502f03SStefano Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4403ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4404ed502f03SStefano Zampini m = A->rmap->n; 4405ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 4406ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4407ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4408ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4409ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4410ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4411ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4412ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4413ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4414ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4415ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4416ed502f03SStefano Zampini c->compressedrow.i = NULL; 4417ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4418ed502f03SStefano Zampini Ccusp->workVector = NULL; 4419ed502f03SStefano Zampini Ccusp->nrows = m; 4420ed502f03SStefano Zampini Ccusp->mat = Cmat; 4421ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4422ed502f03SStefano Zampini Ccsr->num_rows = m; 4423ed502f03SStefano Zampini Ccsr->num_cols = n; 4424ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4425ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4426ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 
4427ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4428ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4429ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4430ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4431ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4432ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4433ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4434ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4435ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4436ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4437ed502f03SStefano Zampini 4438ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4439ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4440ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4441ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4442ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4443ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4444ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4445ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4446ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4447ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4448ed502f03SStefano Zampini if (c->nz) { 44492ed87e7eSStefano Zampini auto Acoo = new 
THRUSTINTARRAY32(Annz); 44502ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 44512ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 44522ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 44532ed87e7eSStefano Zampini 4454ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4455ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4456ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4457ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4458ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4459ed502f03SStefano Zampini } 44602ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 44612ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4462ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4463ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4464ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4465ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4466ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4467ed502f03SStefano Zampini } 44682ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 44692ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 4470ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 44712ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 44722ed87e7eSStefano Zampini Aroff->data().get(), 44732ed87e7eSStefano Zampini Annz, 44742ed87e7eSStefano Zampini m, 44752ed87e7eSStefano Zampini Acoo->data().get(), 44762ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4477ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 44782ed87e7eSStefano Zampini Broff->data().get(), 4479ed502f03SStefano Zampini Bnnz, 4480ed502f03SStefano Zampini m, 
44812ed87e7eSStefano Zampini Bcoo->data().get(), 4482ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 44832ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 44842ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 44852ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 44868909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4487ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4488ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 44898909a122SStefano Zampini #else 44908909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 44918909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 44928909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 44938909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 44948909a122SStefano Zampini #endif 44952ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 44962ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 44972ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 44982ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 44992ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 45002ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4501ed502f03SStefano Zampini auto p1 = 
Ccusp->cooPerm->begin(); 4502ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4503ed502f03SStefano Zampini thrust::advance(p2,Annz); 45042ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 45058909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 45068909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 45078909a122SStefano Zampini #endif 45082ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 45092ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 45102ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 45112ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 45122ed87e7eSStefano Zampini #else 45132ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 45142ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 45152ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 45162ed87e7eSStefano Zampini #endif 4517ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 45182ed87e7eSStefano Zampini Ccoo->data().get(), 4519ed502f03SStefano Zampini c->nz, 4520ed502f03SStefano Zampini m, 4521ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4522ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4523ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 45242ed87e7eSStefano Zampini delete wPerm; 45252ed87e7eSStefano Zampini delete Acoo; 45262ed87e7eSStefano Zampini delete Bcoo; 45272ed87e7eSStefano Zampini delete Ccoo; 4528ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4529ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4530ed502f03SStefano Zampini 
Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4531ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4532ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4533ed502f03SStefano Zampini #endif 45341a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 45353606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 45363606e59fSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4537ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4538ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4539ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4540ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4541ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4542ed502f03SStefano Zampini 45431a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 45441a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4545a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4546ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4547ed502f03SStefano Zampini CmatT->mat = CcsrT; 4548ed502f03SStefano Zampini CcsrT->num_rows = n; 4549ed502f03SStefano Zampini CcsrT->num_cols = m; 4550ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4551ed502f03SStefano Zampini 4552ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4553ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4554ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4555ed502f03SStefano Zampini 4556ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4557ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4558ed502f03SStefano Zampini if (AT) { 4559ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4560ed502f03SStefano Zampini thrust::advance(rT,-1); 4561ed502f03SStefano Zampini } 4562ed502f03SStefano Zampini if (BT) { 4563ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4564ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4565ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4566ed502f03SStefano Zampini } 4567ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4568ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4569ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4570ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4571ed502f03SStefano Zampini if (AT) vT = 
thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4572ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4573ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4574ed502f03SStefano Zampini 4575ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4576ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4577ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4578ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4579ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4580ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4581ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4582ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4583ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4584ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4585ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4586ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4587ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4588ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4589ed502f03SStefano Zampini #endif 4590ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4591ed502f03SStefano Zampini 
} 4592ed502f03SStefano Zampini } 4593ed502f03SStefano Zampini 4594ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4595ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4596ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4597ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4598ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4599ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4600ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4601ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4602ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4603ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4604ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4605ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4606ed502f03SStefano Zampini } else { 4607ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4608ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4609ed502f03SStefano Zampini } 4610ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4611ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4612ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4613ed502f03SStefano Zampini c->maxnz = c->nz; 4614ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4615ed502f03SStefano Zampini c->rmax = 0; 4616ed502f03SStefano Zampini for (i = 0; i < m; i++) { 
4617ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4618ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4619ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4620ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4621ed502f03SStefano Zampini } 4622ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4623ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4624ed502f03SStefano Zampini (*C)->nonzerostate++; 4625ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4626ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4627ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4628ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4629ed502f03SStefano Zampini } else { 463098921bdaSJacob Faibussowitsch if ((*C)->rmap->n != B->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4631ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4632ed502f03SStefano Zampini if (c->nz) { 4633ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4634ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4635ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4636ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4637ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4638ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4639ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4640ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 4641ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4642ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4643ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 464498921bdaSJacob Faibussowitsch if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 464598921bdaSJacob Faibussowitsch if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 464698921bdaSJacob Faibussowitsch if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 464798921bdaSJacob Faibussowitsch if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 464898921bdaSJacob Faibussowitsch if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4649ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4650ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4651ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4652ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4653ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4654ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4655ed502f03SStefano Zampini 
thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4656ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4657ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4658ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4659ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4660ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4661ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4662a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 46631a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4664ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4665ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4666ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4667ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4668ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4669ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4670ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4671ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 46721a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4673ed502f03SStefano Zampini } 4674ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4675ed502f03SStefano Zampini } 4676ed502f03SStefano Zampini } 4677ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4678ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4679ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4680ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4681ed502f03SStefano Zampini PetscFunctionReturn(0); 4682ed502f03SStefano Zampini } 4683c215019aSStefano Zampini 4684c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4685c215019aSStefano Zampini { 4686c215019aSStefano Zampini PetscErrorCode ierr; 4687c215019aSStefano Zampini bool dmem; 4688c215019aSStefano Zampini const PetscScalar *av; 4689c215019aSStefano Zampini cudaError_t cerr; 4690c215019aSStefano Zampini 4691c215019aSStefano Zampini PetscFunctionBegin; 4692c215019aSStefano Zampini dmem = isCudaMem(v); 4693c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4694c215019aSStefano Zampini if (n && idx) { 4695c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4696c215019aSStefano Zampini widx.assign(idx,idx+n); 4697c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4698c215019aSStefano Zampini 4699c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4700c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 
4701c215019aSStefano Zampini if (dmem) { 4702c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4703c215019aSStefano Zampini } else { 4704c215019aSStefano Zampini w = new THRUSTARRAY(n); 4705c215019aSStefano Zampini dv = w->data(); 4706c215019aSStefano Zampini } 4707c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4708c215019aSStefano Zampini 4709c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4710c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4711c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4712c215019aSStefano Zampini if (w) { 4713c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4714c215019aSStefano Zampini } 4715c215019aSStefano Zampini delete w; 4716c215019aSStefano Zampini } else { 4717c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4718c215019aSStefano Zampini } 4719c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4720c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4721c215019aSStefano Zampini PetscFunctionReturn(0); 4722c215019aSStefano Zampini } 4723