19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94c215019aSStefano Zampini 95b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 96b06137fdSPaul Mullowney { 97b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 98b06137fdSPaul Mullowney 99b06137fdSPaul Mullowney PetscFunctionBegin; 100*28b400f6SJacob Faibussowitsch 
PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 101b06137fdSPaul Mullowney cusparsestruct->stream = stream; 1025f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream)); 103b06137fdSPaul Mullowney PetscFunctionReturn(0); 104b06137fdSPaul Mullowney } 105b06137fdSPaul Mullowney 106b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 107b06137fdSPaul Mullowney { 108b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 109b06137fdSPaul Mullowney 110b06137fdSPaul Mullowney PetscFunctionBegin; 111*28b400f6SJacob Faibussowitsch PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1126b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11316a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 1145f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseDestroy(cusparsestruct->handle)); 11516a2e217SAlejandro Lamas Daviña } 116b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1176b1cf21dSAlejandro Lamas Daviña } 1185f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 119b06137fdSPaul Mullowney PetscFunctionReturn(0); 120b06137fdSPaul Mullowney } 121b06137fdSPaul Mullowney 122b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 123b06137fdSPaul Mullowney { 124b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1257e8381f9SStefano Zampini PetscBool flg; 126ccdfe979SStefano Zampini 127b06137fdSPaul Mullowney PetscFunctionBegin; 1285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1297e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 130ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 131b06137fdSPaul Mullowney PetscFunctionReturn(0); 
132b06137fdSPaul Mullowney } 133b06137fdSPaul Mullowney 134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1359ae82921SPaul Mullowney { 1369ae82921SPaul Mullowney PetscFunctionBegin; 1379ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1389ae82921SPaul Mullowney PetscFunctionReturn(0); 1399ae82921SPaul Mullowney } 1409ae82921SPaul Mullowney 141c708e6cdSJed Brown /*MC 142087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 143087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 144087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 145087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 146087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 147087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
148c708e6cdSJed Brown 1499ae82921SPaul Mullowney Level: beginner 150c708e6cdSJed Brown 1513ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152c708e6cdSJed Brown M*/ 1539ae82921SPaul Mullowney 15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1559ae82921SPaul Mullowney { 156bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1579ae82921SPaul Mullowney 1589ae82921SPaul Mullowney PetscFunctionBegin; 1595f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B)); 1605f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*B,n,n,n,n)); 1612c7c0729SBarry Smith (*B)->factortype = ftype; 1625f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*B,MATSEQAIJCUSPARSE)); 1632205254eSKarl Rupp 1645f80ce2aSJacob Faibussowitsch if (A->boundtocpu && A->bindingpropagates) CHKERRQ(MatBindToCPU(*B,PETSC_TRUE)); 165087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 1665f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(*B,A,A)); 1679c1083e7SRichard Tran Mills if (!A->boundtocpu) { 1689ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1699ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1709c1083e7SRichard Tran Mills } else { 1719c1083e7SRichard Tran Mills (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1729c1083e7SRichard Tran Mills (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1739c1083e7SRichard Tran Mills } 1745f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 
1765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 177087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1789c1083e7SRichard Tran Mills if (!A->boundtocpu) { 179087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 180087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1819c1083e7SRichard Tran Mills } else { 1829c1083e7SRichard Tran Mills (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1839c1083e7SRichard Tran Mills (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1849c1083e7SRichard Tran Mills } 1855f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 1865f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 1879ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 188bc3f50f2SPaul Mullowney 1895f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1904ac6704cSBarry Smith (*B)->canuseordering = PETSC_TRUE; 1915f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 1929ae82921SPaul Mullowney PetscFunctionReturn(0); 1939ae82921SPaul Mullowney } 1949ae82921SPaul Mullowney 195bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 196ca45077fSPaul Mullowney { 197aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1986e111a19SKarl Rupp 199ca45077fSPaul Mullowney PetscFunctionBegin; 200ca45077fSPaul Mullowney switch (op) { 201e057df02SPaul Mullowney 
case MAT_CUSPARSE_MULT: 202aa372e3fSPaul Mullowney cusparsestruct->format = format; 203ca45077fSPaul Mullowney break; 204e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 205aa372e3fSPaul Mullowney cusparsestruct->format = format; 206ca45077fSPaul Mullowney break; 207ca45077fSPaul Mullowney default: 20898921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 209ca45077fSPaul Mullowney } 210ca45077fSPaul Mullowney PetscFunctionReturn(0); 211ca45077fSPaul Mullowney } 2129ae82921SPaul Mullowney 213e057df02SPaul Mullowney /*@ 214e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 215e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 216aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 217e057df02SPaul Mullowney Not Collective 218e057df02SPaul Mullowney 219e057df02SPaul Mullowney Input Parameters: 2208468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 22136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2222692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 223e057df02SPaul Mullowney 224e057df02SPaul Mullowney Output Parameter: 225e057df02SPaul Mullowney 226e057df02SPaul Mullowney Level: intermediate 227e057df02SPaul Mullowney 2288468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 229e057df02SPaul Mullowney @*/ 230e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 231e057df02SPaul Mullowney { 232e057df02SPaul Mullowney PetscFunctionBegin; 233e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 2345f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format))); 235e057df02SPaul Mullowney PetscFunctionReturn(0); 236e057df02SPaul Mullowney } 237e057df02SPaul Mullowney 238365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 239365b711fSMark Adams { 240365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 241365b711fSMark Adams 242365b711fSMark Adams PetscFunctionBegin; 243365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 244365b711fSMark Adams PetscFunctionReturn(0); 245365b711fSMark Adams } 246365b711fSMark Adams 247365b711fSMark Adams /*@ 248365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 249365b711fSMark Adams 250365b711fSMark Adams Input Parameters: 251365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 252365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 253365b711fSMark Adams 254365b711fSMark Adams Output Parameter: 255365b711fSMark Adams 256365b711fSMark Adams Notes: 257365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 258365b711fSMark Adams and moves the factors to the GPU for the solve. 
We have observed better performance keeping the data on the CPU and computing the solve there. 259365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 260365b711fSMark Adams 261365b711fSMark Adams Level: intermediate 262365b711fSMark Adams 263365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 264365b711fSMark Adams @*/ 265365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 266365b711fSMark Adams { 267365b711fSMark Adams PetscFunctionBegin; 268365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 2695f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu))); 270365b711fSMark Adams PetscFunctionReturn(0); 271365b711fSMark Adams } 272365b711fSMark Adams 2731a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 274e6e9a74fSStefano Zampini { 275e6e9a74fSStefano Zampini PetscFunctionBegin; 2761a2c6b5cSJunchao Zhang switch (op) { 2771a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2781a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2795f80ce2aSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 2801a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2811a2c6b5cSJunchao Zhang break; 2821a2c6b5cSJunchao Zhang default: 2835f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetOption_SeqAIJ(A,op,flg)); 2841a2c6b5cSJunchao Zhang break; 285e6e9a74fSStefano Zampini } 286e6e9a74fSStefano Zampini PetscFunctionReturn(0); 287e6e9a74fSStefano Zampini } 288e6e9a74fSStefano Zampini 289bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 290bddcd29dSMark Adams 291bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo 
*info) 292bddcd29dSMark Adams { 293bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 294bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 295bddcd29dSMark Adams PetscBool row_identity,col_identity; 296365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 297bddcd29dSMark Adams 298bddcd29dSMark Adams PetscFunctionBegin; 2995f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A)); 3005f80ce2aSJacob Faibussowitsch CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info)); 301bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 302bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 3035f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(isrow,&row_identity)); 3045f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(iscol,&col_identity)); 305bddcd29dSMark Adams if (row_identity && col_identity) { 306365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 307bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 308bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 309365b711fSMark Adams } 310bddcd29dSMark Adams B->ops->matsolve = NULL; 311bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 312bddcd29dSMark Adams } else { 313365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 314bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 315bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 316365b711fSMark Adams } 317bddcd29dSMark Adams B->ops->matsolve = NULL; 318bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 319bddcd29dSMark Adams } 320bddcd29dSMark Adams 321bddcd29dSMark Adams /* get the triangular factors */ 322365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 3235f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 324365b711fSMark Adams } 325bddcd29dSMark Adams PetscFunctionReturn(0); 326bddcd29dSMark Adams } 327bddcd29dSMark Adams 3284416b707SBarry 
Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 3299ae82921SPaul Mullowney { 3309ae82921SPaul Mullowney PetscErrorCode ierr; 331e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 3329ae82921SPaul Mullowney PetscBool flg; 333a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3346e111a19SKarl Rupp 3359ae82921SPaul Mullowney PetscFunctionBegin; 3365f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options")); 3379ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 338e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 339a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 3405f80ce2aSJacob Faibussowitsch if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 341afb2bd1cSJunchao Zhang 3424c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 343a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 3445f80ce2aSJacob Faibussowitsch if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 3455f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 3465f80ce2aSJacob Faibussowitsch if (flg) CHKERRQ(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 347afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 348afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 
349afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 350afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 3518efa179dSJose E. Roman #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 3522c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 353a435da06SStefano Zampini #else 3542c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 355a435da06SStefano Zampini #endif 356afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 357afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 3582c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 359afb2bd1cSJunchao Zhang 360afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 361afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 3622c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated 
accordingly"); 363afb2bd1cSJunchao Zhang #endif 3644c87dfd4SPaul Mullowney } 3655f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsTail()); 3669ae82921SPaul Mullowney PetscFunctionReturn(0); 3679ae82921SPaul Mullowney } 3689ae82921SPaul Mullowney 3696fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3709ae82921SPaul Mullowney { 371da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3729ae82921SPaul Mullowney 3739ae82921SPaul Mullowney PetscFunctionBegin; 3745f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3755f80ce2aSJacob Faibussowitsch CHKERRQ(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 3769ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3779ae82921SPaul Mullowney PetscFunctionReturn(0); 3789ae82921SPaul Mullowney } 3799ae82921SPaul Mullowney 3806fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3819ae82921SPaul Mullowney { 382da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3839ae82921SPaul Mullowney 3849ae82921SPaul Mullowney PetscFunctionBegin; 3855f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3865f80ce2aSJacob Faibussowitsch CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 3879ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3889ae82921SPaul Mullowney PetscFunctionReturn(0); 3899ae82921SPaul Mullowney } 3909ae82921SPaul Mullowney 391087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 392087f3262SPaul Mullowney { 393da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 394087f3262SPaul Mullowney 395087f3262SPaul Mullowney PetscFunctionBegin; 3965f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3975f80ce2aSJacob Faibussowitsch CHKERRQ(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 398087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 399087f3262SPaul Mullowney PetscFunctionReturn(0); 400087f3262SPaul Mullowney } 401087f3262SPaul Mullowney 402087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 403087f3262SPaul Mullowney { 404da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 405087f3262SPaul Mullowney 406087f3262SPaul Mullowney PetscFunctionBegin; 4075f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 4085f80ce2aSJacob Faibussowitsch CHKERRQ(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 409087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 410087f3262SPaul Mullowney PetscFunctionReturn(0); 411087f3262SPaul Mullowney } 412087f3262SPaul Mullowney 413087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 4149ae82921SPaul Mullowney { 4159ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4169ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4179ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 418aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 4199ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 4209ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4219ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 4229ae82921SPaul Mullowney PetscInt i,nz, nzLower, 
offset, rowOffset; 4239ae82921SPaul Mullowney 4249ae82921SPaul Mullowney PetscFunctionBegin; 425cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 426c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4279ae82921SPaul Mullowney try { 4289ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 4299ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 430da79fbbcSStefano Zampini if (!loTriFactor) { 4312cbc15d9SMark PetscScalar *AALo; 4322cbc15d9SMark 4335f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 4349ae82921SPaul Mullowney 4359ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 4365f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 4375f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 4389ae82921SPaul Mullowney 4399ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 4409ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 4419ae82921SPaul Mullowney AiLo[n] = nzLower; 4429ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 4439ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4449ae82921SPaul Mullowney v = aa; 4459ae82921SPaul Mullowney vi = aj; 4469ae82921SPaul Mullowney offset = 1; 4479ae82921SPaul Mullowney rowOffset= 1; 4489ae82921SPaul Mullowney for (i=1; i<n; i++) { 4499ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 450e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4519ae82921SPaul Mullowney AiLo[i] = rowOffset; 4529ae82921SPaul Mullowney rowOffset += nz+1; 4539ae82921SPaul Mullowney 4545f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AjLo[offset]), vi, nz)); 4555f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AALo[offset]), v, nz)); 4569ae82921SPaul Mullowney 4579ae82921SPaul Mullowney offset += nz; 4589ae82921SPaul Mullowney 
AjLo[offset] = (PetscInt) i; 4599ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4609ae82921SPaul Mullowney offset += 1; 4619ae82921SPaul Mullowney 4629ae82921SPaul Mullowney v += nz; 4639ae82921SPaul Mullowney vi += nz; 4649ae82921SPaul Mullowney } 4652205254eSKarl Rupp 466aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4675f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&loTriFactor)); 468da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 469aa372e3fSPaul Mullowney /* Create the matrix description */ 4705f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 4715f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4735f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 474afb2bd1cSJunchao Zhang #else 4755f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 476afb2bd1cSJunchao Zhang #endif 4775f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 4785f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 479aa372e3fSPaul Mullowney 480aa372e3fSPaul Mullowney /* set the operation */ 481aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 482aa372e3fSPaul Mullowney 483aa372e3fSPaul Mullowney /* set the matrix */ 484aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 485aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 486aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 487aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 488aa372e3fSPaul Mullowney 489aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new 
THRUSTINTARRAY32(n+1); 490aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 491aa372e3fSPaul Mullowney 492aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 493aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 494aa372e3fSPaul Mullowney 495aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 496aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 497aa372e3fSPaul Mullowney 498afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 5005f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 5011b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 5025f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 503afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 504afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 505afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 5065f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 5075f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 508afb2bd1cSJunchao Zhang #endif 509afb2bd1cSJunchao Zhang 510aa372e3fSPaul Mullowney /* perform the solve analysis */ 5115f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 512aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 513aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 514d49cd2b7SBarry Smith 
loTriFactor->csrMat->column_indices->data().get(), 5151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 516d49cd2b7SBarry Smith loTriFactor->solveInfo, 5175f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 518d49cd2b7SBarry Smith #else 5195f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 520afb2bd1cSJunchao Zhang #endif 5215f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 5225f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 523aa372e3fSPaul Mullowney 524da79fbbcSStefano Zampini /* assign the pointer */ 525aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 5262cbc15d9SMark loTriFactor->AA_h = AALo; 5275f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AiLo)); 5285f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AjLo)); 5295f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 530da79fbbcSStefano Zampini } else { /* update values only */ 5312cbc15d9SMark if (!loTriFactor->AA_h) { 5325f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 5332cbc15d9SMark } 534da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 5352cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 536da79fbbcSStefano Zampini v = aa; 537da79fbbcSStefano Zampini vi = aj; 538da79fbbcSStefano Zampini offset = 1; 539da79fbbcSStefano Zampini for (i=1; i<n; i++) { 540da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 5415f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 542da79fbbcSStefano Zampini offset += nz; 5432cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 544da79fbbcSStefano Zampini offset += 1; 545da79fbbcSStefano Zampini v += nz; 546da79fbbcSStefano Zampini } 5472cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 5485f80ce2aSJacob Faibussowitsch 
CHKERRQ(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 549da79fbbcSStefano Zampini } 5509ae82921SPaul Mullowney } catch(char *ex) { 55198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5529ae82921SPaul Mullowney } 5539ae82921SPaul Mullowney } 5549ae82921SPaul Mullowney PetscFunctionReturn(0); 5559ae82921SPaul Mullowney } 5569ae82921SPaul Mullowney 557087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5589ae82921SPaul Mullowney { 5599ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5609ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5619ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 562aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5639ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5649ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5659ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5669ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5679ae82921SPaul Mullowney 5689ae82921SPaul Mullowney PetscFunctionBegin; 569cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 570c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5719ae82921SPaul Mullowney try { 5729ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5739ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 574da79fbbcSStefano Zampini if (!upTriFactor) { 5752cbc15d9SMark PetscScalar *AAUp; 5762cbc15d9SMark 5775f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 5782cbc15d9SMark 5799ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 5805f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 5815f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 5829ae82921SPaul Mullowney 5839ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5849ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5859ae82921SPaul Mullowney AiUp[n]=nzUpper; 5869ae82921SPaul Mullowney offset = nzUpper; 5879ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5889ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5899ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5909ae82921SPaul Mullowney 591e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5929ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5939ae82921SPaul Mullowney 594e057df02SPaul Mullowney /* decrement the offset */ 5959ae82921SPaul Mullowney offset -= (nz+1); 5969ae82921SPaul Mullowney 597e057df02SPaul Mullowney /* first, set the diagonal elements */ 5989ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 59909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 6009ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 6019ae82921SPaul Mullowney 6025f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 6035f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 6049ae82921SPaul Mullowney } 6052205254eSKarl Rupp 606aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 6075f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&upTriFactor)); 608da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 
6092205254eSKarl Rupp 610aa372e3fSPaul Mullowney /* Create the matrix description */ 6115f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 6125f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 6131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 6145f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 615afb2bd1cSJunchao Zhang #else 6165f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 617afb2bd1cSJunchao Zhang #endif 6185f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 6195f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 620aa372e3fSPaul Mullowney 621aa372e3fSPaul Mullowney /* set the operation */ 622aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 623aa372e3fSPaul Mullowney 624aa372e3fSPaul Mullowney /* set the matrix */ 625aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 626aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 627aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 628aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 629aa372e3fSPaul Mullowney 630aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 631aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 632aa372e3fSPaul Mullowney 633aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 634aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 635aa372e3fSPaul Mullowney 636aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 637aa372e3fSPaul Mullowney 
upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 638aa372e3fSPaul Mullowney 639afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 6415f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 6421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 6435f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 644afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 645afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 646afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 6475f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 6485f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 649afb2bd1cSJunchao Zhang #endif 650afb2bd1cSJunchao Zhang 651aa372e3fSPaul Mullowney /* perform the solve analysis */ 6525f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 653aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 654aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 655d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 657d49cd2b7SBarry Smith upTriFactor->solveInfo, 6585f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 659d49cd2b7SBarry Smith #else 6605f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 661afb2bd1cSJunchao Zhang #endif 6625f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 6635f80ce2aSJacob 
Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 664aa372e3fSPaul Mullowney 665da79fbbcSStefano Zampini /* assign the pointer */ 666aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6672cbc15d9SMark upTriFactor->AA_h = AAUp; 6685f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AiUp)); 6695f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AjUp)); 6705f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 671da79fbbcSStefano Zampini } else { 6722cbc15d9SMark if (!upTriFactor->AA_h) { 6735f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 6742cbc15d9SMark } 675da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 676da79fbbcSStefano Zampini offset = nzUpper; 677da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 678da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 679da79fbbcSStefano Zampini 680da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 681da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 682da79fbbcSStefano Zampini 683da79fbbcSStefano Zampini /* decrement the offset */ 684da79fbbcSStefano Zampini offset -= (nz+1); 685da79fbbcSStefano Zampini 686da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6872cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6885f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 689da79fbbcSStefano Zampini } 6902cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 6915f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 692da79fbbcSStefano Zampini } 6939ae82921SPaul Mullowney } catch(char *ex) { 69498921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6959ae82921SPaul Mullowney } 6969ae82921SPaul Mullowney } 6979ae82921SPaul Mullowney 
PetscFunctionReturn(0); 6989ae82921SPaul Mullowney } 6999ae82921SPaul Mullowney 700087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 7019ae82921SPaul Mullowney { 7029ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 7039ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 7049ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 7059ae82921SPaul Mullowney PetscBool row_identity,col_identity; 7069ae82921SPaul Mullowney PetscInt n = A->rmap->n; 7079ae82921SPaul Mullowney 7089ae82921SPaul Mullowney PetscFunctionBegin; 709*28b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 7105f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 7115f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 7122205254eSKarl Rupp 713da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 714aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 7159ae82921SPaul Mullowney 716c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 717e057df02SPaul Mullowney /* lower triangular indices */ 7185f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(isrow,&row_identity)); 719da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 720da79fbbcSStefano Zampini const PetscInt *r; 721da79fbbcSStefano Zampini 7225f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(isrow,&r)); 723aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 724aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 7255f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(isrow,&r)); 7265f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt))); 727da79fbbcSStefano Zampini } 7289ae82921SPaul Mullowney 729e057df02SPaul Mullowney /* upper 
triangular indices */ 7305f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(iscol,&col_identity)); 731da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 732da79fbbcSStefano Zampini const PetscInt *c; 733da79fbbcSStefano Zampini 7345f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(iscol,&c)); 735aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 736aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 7375f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(iscol,&c)); 7385f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt))); 739da79fbbcSStefano Zampini } 7409ae82921SPaul Mullowney PetscFunctionReturn(0); 7419ae82921SPaul Mullowney } 7429ae82921SPaul Mullowney 743087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 744087f3262SPaul Mullowney { 745087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 746087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 747aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 748aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 749087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 750087f3262SPaul Mullowney PetscScalar *AAUp; 751087f3262SPaul Mullowney PetscScalar *AALo; 752087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 753087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 754087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 755087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 756087f3262SPaul Mullowney 757087f3262SPaul Mullowney PetscFunctionBegin; 758cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 759c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || 
A->offloadmask == PETSC_OFFLOAD_CPU) { 760087f3262SPaul Mullowney try { 7615f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 7625f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 763da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 764087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 7655f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 7665f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 767087f3262SPaul Mullowney 768087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 769087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 770087f3262SPaul Mullowney AiUp[n]=nzUpper; 771087f3262SPaul Mullowney offset = 0; 772087f3262SPaul Mullowney for (i=0; i<n; i++) { 773087f3262SPaul Mullowney /* set the pointers */ 774087f3262SPaul Mullowney v = aa + ai[i]; 775087f3262SPaul Mullowney vj = aj + ai[i]; 776087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 777087f3262SPaul Mullowney 778087f3262SPaul Mullowney /* first, set the diagonal elements */ 779087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 78009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 781087f3262SPaul Mullowney AiUp[i] = offset; 78209f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 783087f3262SPaul Mullowney 784087f3262SPaul Mullowney offset+=1; 785087f3262SPaul Mullowney if (nz>0) { 7865f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AjUp[offset]), vj, nz)); 7875f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz)); 788087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 789087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 790087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 791087f3262SPaul Mullowney } 792087f3262SPaul Mullowney offset+=nz; 793087f3262SPaul Mullowney } 794087f3262SPaul 
Mullowney } 795087f3262SPaul Mullowney 796aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 7975f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&upTriFactor)); 798da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 799087f3262SPaul Mullowney 800aa372e3fSPaul Mullowney /* Create the matrix description */ 8015f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 8025f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8045f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 805afb2bd1cSJunchao Zhang #else 8065f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 807afb2bd1cSJunchao Zhang #endif 8085f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8095f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 810087f3262SPaul Mullowney 811aa372e3fSPaul Mullowney /* set the matrix */ 812aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 813aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 814aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 815aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 816aa372e3fSPaul Mullowney 817aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 818aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 819aa372e3fSPaul Mullowney 820aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 821aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 824aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 825aa372e3fSPaul Mullowney 826afb2bd1cSJunchao Zhang /* set the operation */ 827afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 828afb2bd1cSJunchao Zhang 829afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8305f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 8315f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 8321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8335f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 834afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 835afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 836afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 8375f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 8385f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 839afb2bd1cSJunchao Zhang #endif 840afb2bd1cSJunchao Zhang 841aa372e3fSPaul Mullowney /* perform the solve analysis */ 8425f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 843aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 844aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 845d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 8461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 847d49cd2b7SBarry Smith upTriFactor->solveInfo, 8485f80ce2aSJacob 
Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 849d49cd2b7SBarry Smith #else 8505f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 851afb2bd1cSJunchao Zhang #endif 8525f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 8535f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 854aa372e3fSPaul Mullowney 855da79fbbcSStefano Zampini /* assign the pointer */ 856aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 857aa372e3fSPaul Mullowney 858aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8595f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&loTriFactor)); 860da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 861aa372e3fSPaul Mullowney 862aa372e3fSPaul Mullowney /* Create the matrix description */ 8635f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 8645f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8665f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 867afb2bd1cSJunchao Zhang #else 8685f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 869afb2bd1cSJunchao Zhang #endif 8705f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8715f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 872aa372e3fSPaul Mullowney 873aa372e3fSPaul Mullowney /* set the operation */ 874aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 875aa372e3fSPaul Mullowney 876aa372e3fSPaul Mullowney /* set the matrix */ 877aa372e3fSPaul Mullowney loTriFactor->csrMat = new 
CsrMatrix; 878aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 879aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 880aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 881aa372e3fSPaul Mullowney 882aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 883aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 884aa372e3fSPaul Mullowney 885aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 886aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 887aa372e3fSPaul Mullowney 888aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 889aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 890aa372e3fSPaul Mullowney 891afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8925f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 8935f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 8941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8955f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 896afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 897afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 898afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 8995f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 9005f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 901afb2bd1cSJunchao Zhang #endif 902afb2bd1cSJunchao Zhang 903aa372e3fSPaul Mullowney /* perform the solve analysis */ 9045f80ce2aSJacob Faibussowitsch 
CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 905aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 906aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 907d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 9081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 909d49cd2b7SBarry Smith loTriFactor->solveInfo, 9105f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 911d49cd2b7SBarry Smith #else 9125f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 913afb2bd1cSJunchao Zhang #endif 9145f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 9155f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 916aa372e3fSPaul Mullowney 917da79fbbcSStefano Zampini /* assign the pointer */ 918aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 919087f3262SPaul Mullowney 9205f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 9215f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AiUp)); 9225f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AjUp)); 923da79fbbcSStefano Zampini } else { 924da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 925da79fbbcSStefano Zampini offset = 0; 926da79fbbcSStefano Zampini for (i=0; i<n; i++) { 927da79fbbcSStefano Zampini /* set the pointers */ 928da79fbbcSStefano Zampini v = aa + ai[i]; 929da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 930da79fbbcSStefano Zampini 931da79fbbcSStefano Zampini /* first, set the diagonal elements */ 932da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 933da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 934da79fbbcSStefano Zampini 935da79fbbcSStefano Zampini offset+=1; 936da79fbbcSStefano Zampini if 
(nz>0) { 9375f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz)); 938da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 939da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 940da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 941da79fbbcSStefano Zampini } 942da79fbbcSStefano Zampini offset+=nz; 943da79fbbcSStefano Zampini } 944da79fbbcSStefano Zampini } 945*28b400f6SJacob Faibussowitsch PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 946*28b400f6SJacob Faibussowitsch PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 947da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 948da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 9495f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 950da79fbbcSStefano Zampini } 9515f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AAUp)); 9525f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFreeHost(AALo)); 953087f3262SPaul Mullowney } catch(char *ex) { 95498921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 955087f3262SPaul Mullowney } 956087f3262SPaul Mullowney } 957087f3262SPaul Mullowney PetscFunctionReturn(0); 958087f3262SPaul Mullowney } 959087f3262SPaul Mullowney 960087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9619ae82921SPaul Mullowney { 962087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 963087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 964087f3262SPaul Mullowney IS ip = a->row; 965087f3262SPaul Mullowney PetscBool perm_identity; 966087f3262SPaul Mullowney PetscInt n = A->rmap->n; 967087f3262SPaul Mullowney 968087f3262SPaul Mullowney PetscFunctionBegin; 969*28b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
cusparseTriFactors"); 9705f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 971da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 972aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 973aa372e3fSPaul Mullowney 974da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 975da79fbbcSStefano Zampini 976087f3262SPaul Mullowney /* lower triangular indices */ 9775f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(ip,&perm_identity)); 978087f3262SPaul Mullowney if (!perm_identity) { 9794e4bbfaaSStefano Zampini IS iip; 980da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9814e4bbfaaSStefano Zampini 9825f80ce2aSJacob Faibussowitsch CHKERRQ(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 9835f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(iip,&irip)); 9845f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(ip,&rip)); 985aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 986aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 987aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9884e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9895f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(iip,&irip)); 9905f80ce2aSJacob Faibussowitsch CHKERRQ(ISDestroy(&iip)); 9915f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(ip,&rip)); 9925f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 993da79fbbcSStefano Zampini } 994087f3262SPaul Mullowney PetscFunctionReturn(0); 995087f3262SPaul Mullowney } 996087f3262SPaul Mullowney 997087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 998087f3262SPaul Mullowney { 999087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1000087f3262SPaul Mullowney IS ip = b->row; 1001087f3262SPaul Mullowney PetscBool 
perm_identity; 1002087f3262SPaul Mullowney 1003087f3262SPaul Mullowney PetscFunctionBegin; 10045f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A)); 10055f80ce2aSJacob Faibussowitsch CHKERRQ(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 1006ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 1007087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 10085f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(ip,&perm_identity)); 1009087f3262SPaul Mullowney if (perm_identity) { 1010087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1011087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 10124e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10134e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1014087f3262SPaul Mullowney } else { 1015087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1016087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 10174e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10184e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1019087f3262SPaul Mullowney } 1020087f3262SPaul Mullowney 1021087f3262SPaul Mullowney /* get the triangular factors */ 10225f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1023087f3262SPaul Mullowney PetscFunctionReturn(0); 1024087f3262SPaul Mullowney } 10259ae82921SPaul Mullowney 1026b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1027bda325fcSPaul Mullowney { 1028bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1029aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1030aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1031da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1032da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1033aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1034aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1035aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1036aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 1037b175d8bbSPaul Mullowney 1038bda325fcSPaul Mullowney PetscFunctionBegin; 1039aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 10405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&loTriFactorT)); 1041da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1042aa372e3fSPaul Mullowney 1043aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1044aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1045aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1046aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1047aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1048aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1049aa372e3fSPaul Mullowney 1050aa372e3fSPaul Mullowney /* Create the matrix description */ 10515f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 10525f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 10535f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 10545f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 10555f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1056aa372e3fSPaul Mullowney 1057aa372e3fSPaul Mullowney /* set the operation */ 1058aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1059aa372e3fSPaul Mullowney 1060aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1061aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1062afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1063afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1064aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1065afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1066afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1067afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1068aa372e3fSPaul Mullowney 1069aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1070afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 10715f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1072afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1073afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1074afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1075afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1077afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1078afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 10795f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 10805f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1081afb2bd1cSJunchao Zhang #endif 1082afb2bd1cSJunchao Zhang 10835f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 10845f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1085aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1086aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1087aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1088aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1089aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1090afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1091afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1092afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 10935f80ce2aSJacob 
Faibussowitsch CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1094afb2bd1cSJunchao Zhang #else 1095afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 10965f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1097afb2bd1cSJunchao Zhang #endif 10985f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 10995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1100aa372e3fSPaul Mullowney 1101afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11025f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 11035f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 11041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 11055f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1106afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1107afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1108afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 11095f80ce2aSJacob Faibussowitsch &loTriFactorT->solveBufferSize)); 11105f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1111afb2bd1cSJunchao Zhang #endif 1112afb2bd1cSJunchao Zhang 1113afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11145f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1115afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1116afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 
1117d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 11181b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1119d49cd2b7SBarry Smith loTriFactorT->solveInfo, 11205f80ce2aSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1121d49cd2b7SBarry Smith #else 11225f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 1123afb2bd1cSJunchao Zhang #endif 11245f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 11255f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1126aa372e3fSPaul Mullowney 1127da79fbbcSStefano Zampini /* assign the pointer */ 1128aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1129aa372e3fSPaul Mullowney 1130aa372e3fSPaul Mullowney /*********************************************/ 1131aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1132aa372e3fSPaul Mullowney /*********************************************/ 1133aa372e3fSPaul Mullowney 1134aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 11355f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&upTriFactorT)); 1136da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1137aa372e3fSPaul Mullowney 1138aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1139aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1140aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1141aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1142aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1143aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1144aa372e3fSPaul Mullowney 1145aa372e3fSPaul Mullowney /* Create the matrix description */ 11465f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 11475f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 11485f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 11495f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 11505f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1151aa372e3fSPaul Mullowney 1152aa372e3fSPaul Mullowney /* set the operation */ 1153aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1154aa372e3fSPaul Mullowney 1155aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1156aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1157afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1158afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1159aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1160afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1161afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1162afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1163aa372e3fSPaul Mullowney 1164aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1165afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 11665f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1167afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1168afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1169afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1170afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1171afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1172afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1173afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 11745f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 11755f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1176afb2bd1cSJunchao Zhang #endif 1177afb2bd1cSJunchao Zhang 11785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 11795f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1180aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1181aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1182aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1183aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1184aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1185afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1186afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1187afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 11885f80ce2aSJacob 
Faibussowitsch CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1189afb2bd1cSJunchao Zhang #else 1190afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 11915f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1192afb2bd1cSJunchao Zhang #endif 1193d49cd2b7SBarry Smith 11945f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 11955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1196aa372e3fSPaul Mullowney 1197afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 11995f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 12001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 12015f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1202afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1203afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1204afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 12055f80ce2aSJacob Faibussowitsch &upTriFactorT->solveBufferSize)); 12065f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1207afb2bd1cSJunchao Zhang #endif 1208afb2bd1cSJunchao Zhang 1209afb2bd1cSJunchao Zhang /* perform the solve analysis */ 12105f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? 
*/ 12115f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1212afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1213afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1214d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 12151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1216d49cd2b7SBarry Smith upTriFactorT->solveInfo, 12175f80ce2aSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1218d49cd2b7SBarry Smith #else 12195f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1220afb2bd1cSJunchao Zhang #endif 1221d49cd2b7SBarry Smith 12225f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 12235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1224aa372e3fSPaul Mullowney 1225da79fbbcSStefano Zampini /* assign the pointer */ 1226aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1227bda325fcSPaul Mullowney PetscFunctionReturn(0); 1228bda325fcSPaul Mullowney } 1229bda325fcSPaul Mullowney 1230a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1231a49f1ed0SStefano Zampini { 1232a49f1ed0SStefano Zampini __host__ __device__ 1233a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1234a49f1ed0SStefano Zampini { 1235a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1236a49f1ed0SStefano Zampini } 1237a49f1ed0SStefano Zampini }; 1238a49f1ed0SStefano Zampini 12393606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1240bda325fcSPaul Mullowney { 1241aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1242a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1243bda325fcSPaul Mullowney Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1244bda325fcSPaul Mullowney cusparseStatus_t stat; 1245aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1246b175d8bbSPaul Mullowney 1247bda325fcSPaul Mullowney PetscFunctionBegin; 12485f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 1249a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1250*28b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1251a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 12522c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12531a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 12545f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 12555f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1256a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 12575f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1258a49f1ed0SStefano Zampini } 1259a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1260aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12615f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1262aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12635f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12645f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1265aa372e3fSPaul Mullowney 1266b06137fdSPaul Mullowney /* set alpha and beta */ 12675f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 12685f80ce2aSJacob Faibussowitsch 
CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 12695f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 12705f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 12715f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 12725f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1273b06137fdSPaul Mullowney 1274aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1275aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1276a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1277554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1278554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1279aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1280a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1281aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1282aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1283a3fdcf43SKarl Rupp 1284039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 128581902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1286afb2bd1cSJunchao Zhang 1287afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 12883606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1289afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1290afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1291afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1292afb2bd1cSJunchao Zhang matrixT->values->data().get(), 
1293afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1294afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 12953606e59fSJunchao Zhang #else 12963606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12973606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12983606e59fSJunchao Zhang 12993606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 13003606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 13013606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 13023606e59fSJunchao Zhang */ 13033606e59fSJunchao Zhang if (matrixT->num_entries) { 13043606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 13053606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 13063606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 13073606e59fSJunchao Zhang matrixT->values->data().get(), 13083606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 13093606e59fSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 13103606e59fSJunchao Zhang 13113606e59fSJunchao Zhang } else { 13123606e59fSJunchao Zhang matstructT->matDescr = NULL; 13133606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13143606e59fSJunchao Zhang } 13153606e59fSJunchao Zhang #endif 1316afb2bd1cSJunchao Zhang #endif 1317aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1318afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1319afb2bd1cSJunchao Zhang 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1320afb2bd1cSJunchao Zhang #else 1321aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 132251c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 132351c6d536SStefano Zampini /* First convert HYB to CSR */ 1324aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1325aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1326aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1327aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1328aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1329aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1330aa372e3fSPaul Mullowney 1331aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1332aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1333aa372e3fSPaul Mullowney temp->values->data().get(), 1334aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 133557d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1336aa372e3fSPaul Mullowney 1337aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1338aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1339aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1340aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1341aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1342aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1343aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1344aa372e3fSPaul Mullowney 1345aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1346aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1347aa372e3fSPaul Mullowney temp->values->data().get(), 1348aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1349aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1350aa372e3fSPaul Mullowney tempT->values->data().get(), 1351aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1352aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 135357d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1354aa372e3fSPaul Mullowney 1355aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1356aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13575f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat)); 1358aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1359aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1360aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1361aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1362aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1363aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 136457d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1365aa372e3fSPaul Mullowney 1366aa372e3fSPaul Mullowney /* assign the pointer */ 1367aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13681a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1369aa372e3fSPaul Mullowney /* delete temporaries */ 1370aa372e3fSPaul Mullowney if (tempT) { 1371aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1372aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1373aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1374aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1375087f3262SPaul Mullowney } 1376aa372e3fSPaul Mullowney if (temp) { 1377aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1378aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1379aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1380aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1381aa372e3fSPaul Mullowney } 1382afb2bd1cSJunchao Zhang #endif 1383aa372e3fSPaul Mullowney } 1384a49f1ed0SStefano Zampini } 1385a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1386a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1387a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1388*28b400f6SJacob Faibussowitsch 
PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1389*28b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1390*28b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1391*28b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1392*28b400f6SJacob Faibussowitsch PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1393*28b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1394*28b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1395*28b400f6SJacob Faibussowitsch PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1396a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1397a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1398a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 13995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1400a49f1ed0SStefano Zampini } 1401a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1402a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1403a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1404a49f1ed0SStefano Zampini 1405a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1406a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1407a49f1ed0SStefano Zampini void *csr2cscBuffer; 1408a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1409a49f1ed0SStefano Zampini stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1410a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1411a49f1ed0SStefano Zampini matrix->values->data().get(), 1412a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1413a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1414a49f1ed0SStefano Zampini matrixT->values->data().get(), 1415a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1416a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1417a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 14185f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1419a49f1ed0SStefano Zampini #endif 1420a49f1ed0SStefano Zampini 14211a2c6b5cSJunchao Zhang if (matrix->num_entries) { 14221a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 14231a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 14241a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 14251a2c6b5cSJunchao Zhang 14261a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 14271a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
14281a2c6b5cSJunchao Zhang */ 14291a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 14301a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 14311a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 14321a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 14331a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1434a49f1ed0SStefano Zampini matrixT->values->data().get(), 1435a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1436a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1437a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 14381a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1439a49f1ed0SStefano Zampini #else 1440a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14411a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1442a49f1ed0SStefano Zampini #endif 14431a2c6b5cSJunchao Zhang } else { 14441a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14451a2c6b5cSJunchao Zhang } 14461a2c6b5cSJunchao Zhang 1447a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1448a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1449a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 14505f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(csr2cscBuffer)); 1451a49f1ed0SStefano Zampini #endif 1452a49f1ed0SStefano Zampini } 1453a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1454a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1455a49f1ed0SStefano Zampini matrixT->values->begin())); 1456a49f1ed0SStefano Zampini } 14575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 14585f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1459213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1460213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1461aa372e3fSPaul Mullowney /* assign the pointer */ 1462aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14631a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1464bda325fcSPaul Mullowney PetscFunctionReturn(0); 1465bda325fcSPaul Mullowney } 1466bda325fcSPaul Mullowney 1467a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14686fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1469bda325fcSPaul Mullowney { 1470c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1471465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1472465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1473465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1474465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1475bda325fcSPaul Mullowney cusparseStatus_t stat; 1476bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1477aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1478aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1479aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1480bda325fcSPaul Mullowney 
1481bda325fcSPaul Mullowney PetscFunctionBegin; 1482aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1483aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 14845f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1485aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1486aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1487bda325fcSPaul Mullowney } 1488bda325fcSPaul Mullowney 1489bda325fcSPaul Mullowney /* Get the GPU pointers */ 14905f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray)); 14915f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayRead(bb,&barray)); 1492c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1493c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1494bda325fcSPaul Mullowney 14955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1496aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1497a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1498c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1499c41cb2e2SAlejandro Lamas Daviña xGPU); 1500aa372e3fSPaul Mullowney 1501aa372e3fSPaul Mullowney /* First, solve U */ 1502aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1503afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1505afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1506afb2bd1cSJunchao Zhang #endif 1507afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1508aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->values->data().get(), 1509aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1510aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1511aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1512d49cd2b7SBarry Smith xarray, 15131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1514d49cd2b7SBarry Smith tempGPU->data().get(), 1515d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1516d49cd2b7SBarry Smith #else 1517d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1518afb2bd1cSJunchao Zhang #endif 1519aa372e3fSPaul Mullowney 1520aa372e3fSPaul Mullowney /* Then, solve L */ 1521aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1522afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15231b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1524afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1525afb2bd1cSJunchao Zhang #endif 1526afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1527aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1528aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1529aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1530aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1531d49cd2b7SBarry Smith tempGPU->data().get(), 15321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533d49cd2b7SBarry Smith xarray, 1534d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1535d49cd2b7SBarry Smith #else 1536d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1537afb2bd1cSJunchao Zhang #endif 1538aa372e3fSPaul Mullowney 1539aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1540a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1541c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1542aa372e3fSPaul Mullowney tempGPU->begin()); 1543aa372e3fSPaul Mullowney 1544aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1545a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1546bda325fcSPaul Mullowney 1547bda325fcSPaul Mullowney /* restore */ 15485f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayRead(bb,&barray)); 15495f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray)); 15505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 15515f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1552bda325fcSPaul Mullowney PetscFunctionReturn(0); 1553bda325fcSPaul Mullowney } 1554bda325fcSPaul Mullowney 15556fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1556bda325fcSPaul Mullowney { 1557465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1558465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1559bda325fcSPaul Mullowney cusparseStatus_t stat; 1560bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1561aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1562aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1563aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1564bda325fcSPaul Mullowney 1565bda325fcSPaul Mullowney 
PetscFunctionBegin; 1566aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1567aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15685f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1569aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1570aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1571bda325fcSPaul Mullowney } 1572bda325fcSPaul Mullowney 1573bda325fcSPaul Mullowney /* Get the GPU pointers */ 15745f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray)); 15755f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayRead(bb,&barray)); 1576bda325fcSPaul Mullowney 15775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1578aa372e3fSPaul Mullowney /* First, solve U */ 1579aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1580afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1582afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1583afb2bd1cSJunchao Zhang #endif 1584afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1585aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1586aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1587aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1588aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1589d49cd2b7SBarry Smith barray, 15901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1591d49cd2b7SBarry Smith tempGPU->data().get(), 1592d49cd2b7SBarry Smith upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1593d49cd2b7SBarry Smith #else 1594d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1595afb2bd1cSJunchao Zhang #endif 1596aa372e3fSPaul 
Mullowney 1597aa372e3fSPaul Mullowney /* Then, solve L */ 1598aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1599afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 16001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1601afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1602afb2bd1cSJunchao Zhang #endif 1603afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1604aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1605aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1606aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1607aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1608d49cd2b7SBarry Smith tempGPU->data().get(), 16091b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1610d49cd2b7SBarry Smith xarray, 1611d49cd2b7SBarry Smith loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1612d49cd2b7SBarry Smith #else 1613d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1614afb2bd1cSJunchao Zhang #endif 1615bda325fcSPaul Mullowney 1616bda325fcSPaul Mullowney /* restore */ 16175f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayRead(bb,&barray)); 16185f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray)); 16195f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 16205f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1621bda325fcSPaul Mullowney PetscFunctionReturn(0); 1622bda325fcSPaul Mullowney } 1623bda325fcSPaul Mullowney 16246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 16259ae82921SPaul Mullowney { 1626465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1627465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1628465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1629465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> 
xGPU; 16309ae82921SPaul Mullowney cusparseStatus_t stat; 16319ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1632aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1633aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1634aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 16359ae82921SPaul Mullowney 16369ae82921SPaul Mullowney PetscFunctionBegin; 1637ebc8f436SDominic Meiser 1638e057df02SPaul Mullowney /* Get the GPU pointers */ 16395f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray)); 16405f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayRead(bb,&barray)); 1641c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1642c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16439ae82921SPaul Mullowney 16445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1645aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1646a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1647c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16484e4bbfaaSStefano Zampini tempGPU->begin()); 1649aa372e3fSPaul Mullowney 1650aa372e3fSPaul Mullowney /* Next, solve L */ 1651aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1652afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16531b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1654afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1655afb2bd1cSJunchao Zhang #endif 1656afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, 
loTriFactor->descr, 1657aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1658aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1659aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1660aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1661d49cd2b7SBarry Smith tempGPU->data().get(), 16621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1663d49cd2b7SBarry Smith xarray, 1664d49cd2b7SBarry Smith loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1665d49cd2b7SBarry Smith #else 1666d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1667afb2bd1cSJunchao Zhang #endif 1668aa372e3fSPaul Mullowney 1669aa372e3fSPaul Mullowney /* Then, solve U */ 1670aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1671afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1673afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1674afb2bd1cSJunchao Zhang #endif 1675afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1676aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1677aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1678aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1679d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 16801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1681d49cd2b7SBarry Smith tempGPU->data().get(), 1682d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1683d49cd2b7SBarry Smith #else 1684d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1685afb2bd1cSJunchao Zhang #endif 1686d49cd2b7SBarry Smith 16874e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1688a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), 
cusparseTriFactors->cpermIndices->begin()), 16894e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16904e4bbfaaSStefano Zampini xGPU); 16919ae82921SPaul Mullowney 16925f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayRead(bb,&barray)); 16935f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray)); 16945f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 16955f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 16969ae82921SPaul Mullowney PetscFunctionReturn(0); 16979ae82921SPaul Mullowney } 16989ae82921SPaul Mullowney 16996fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 17009ae82921SPaul Mullowney { 1701465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1702465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 17039ae82921SPaul Mullowney cusparseStatus_t stat; 17049ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1705aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1706aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1707aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 17089ae82921SPaul Mullowney 17099ae82921SPaul Mullowney PetscFunctionBegin; 1710e057df02SPaul Mullowney /* Get the GPU pointers */ 17115f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray)); 17125f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayRead(bb,&barray)); 17139ae82921SPaul Mullowney 17145f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1715aa372e3fSPaul Mullowney /* First, solve L */ 1716aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, 
loTriFactor->solveOp, 1717afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 17181b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1719afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1720afb2bd1cSJunchao Zhang #endif 1721afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1722aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1723aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1724aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1725aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1726d49cd2b7SBarry Smith barray, 17271b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1728d49cd2b7SBarry Smith tempGPU->data().get(), 1729d49cd2b7SBarry Smith loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1730d49cd2b7SBarry Smith #else 1731d49cd2b7SBarry Smith tempGPU->data().get());CHKERRCUSPARSE(stat); 1732afb2bd1cSJunchao Zhang #endif 1733d49cd2b7SBarry Smith 1734aa372e3fSPaul Mullowney /* Next, solve U */ 1735aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1736afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 17371b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1738afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1739afb2bd1cSJunchao Zhang #endif 1740afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1741aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1742aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1743aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1744aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1745d49cd2b7SBarry Smith tempGPU->data().get(), 17461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1747d49cd2b7SBarry Smith xarray, 1748d49cd2b7SBarry Smith upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1749d49cd2b7SBarry Smith #else 
1750d49cd2b7SBarry Smith xarray);CHKERRCUSPARSE(stat); 1751afb2bd1cSJunchao Zhang #endif 17529ae82921SPaul Mullowney 17535f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayRead(bb,&barray)); 17545f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray)); 17555f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 17565f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 17579ae82921SPaul Mullowney PetscFunctionReturn(0); 17589ae82921SPaul Mullowney } 17599ae82921SPaul Mullowney 17607e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17617e8381f9SStefano Zampini { 17627e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17637e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17647e8381f9SStefano Zampini 17657e8381f9SStefano Zampini PetscFunctionBegin; 17667e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17677e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17687e8381f9SStefano Zampini 17695f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17705f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 17715f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 17725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 17735f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17747e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17757e8381f9SStefano Zampini } 17767e8381f9SStefano Zampini PetscFunctionReturn(0); 17777e8381f9SStefano Zampini } 17787e8381f9SStefano Zampini 17797e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17807e8381f9SStefano Zampini { 17817e8381f9SStefano Zampini PetscFunctionBegin; 17825f80ce2aSJacob 
Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A)); 178367a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 178467a45760SJunchao Zhang PetscFunctionReturn(0); 178567a45760SJunchao Zhang } 178667a45760SJunchao Zhang 178767a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 178867a45760SJunchao Zhang { 178967a45760SJunchao Zhang PetscFunctionBegin; 17907e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 179167a45760SJunchao Zhang *array = NULL; 179267a45760SJunchao Zhang PetscFunctionReturn(0); 179367a45760SJunchao Zhang } 179467a45760SJunchao Zhang 179567a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 179667a45760SJunchao Zhang { 179767a45760SJunchao Zhang PetscFunctionBegin; 17985f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A)); 179967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 180067a45760SJunchao Zhang PetscFunctionReturn(0); 180167a45760SJunchao Zhang } 180267a45760SJunchao Zhang 180367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 180467a45760SJunchao Zhang { 180567a45760SJunchao Zhang PetscFunctionBegin; 180667a45760SJunchao Zhang *array = NULL; 180767a45760SJunchao Zhang PetscFunctionReturn(0); 180867a45760SJunchao Zhang } 180967a45760SJunchao Zhang 181067a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 181167a45760SJunchao Zhang { 181267a45760SJunchao Zhang PetscFunctionBegin; 181367a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 181467a45760SJunchao Zhang PetscFunctionReturn(0); 181567a45760SJunchao Zhang } 181667a45760SJunchao Zhang 181767a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 181867a45760SJunchao Zhang { 181967a45760SJunchao Zhang PetscFunctionBegin; 182067a45760SJunchao 
Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 182167a45760SJunchao Zhang *array = NULL; 18227e8381f9SStefano Zampini PetscFunctionReturn(0); 18237e8381f9SStefano Zampini } 18247e8381f9SStefano Zampini 1825042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 18269ae82921SPaul Mullowney { 1827aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 18287c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 18299ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1830213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1831aa372e3fSPaul Mullowney cusparseStatus_t stat; 1832abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 18339ae82921SPaul Mullowney 18349ae82921SPaul Mullowney PetscFunctionBegin; 1835*28b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1836c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1837a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1838a49f1ed0SStefano Zampini CsrMatrix *matrix; 1839afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 184085ba7357SStefano Zampini 18412c71b3e2SJacob Faibussowitsch PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 18425f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1843afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 18445f80ce2aSJacob Faibussowitsch CHKERRCUDA(WaitForCUDA()); 18455f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 18465f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18475f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 184834d6c7a5SJose E. 
Roman } else { 1849abb89eb1SStefano Zampini PetscInt nnz; 18505f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18515f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 18525f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 18537c700b8dSJunchao Zhang delete cusparsestruct->workVector; 185481902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1855a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1856a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18579ae82921SPaul Mullowney try { 18589ae82921SPaul Mullowney if (a->compressedrow.use) { 18599ae82921SPaul Mullowney m = a->compressedrow.nrows; 18609ae82921SPaul Mullowney ii = a->compressedrow.i; 18619ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 18629ae82921SPaul Mullowney } else { 1863213423ffSJunchao Zhang m = A->rmap->n; 1864213423ffSJunchao Zhang ii = a->i; 1865e6e9a74fSStefano Zampini ridx = NULL; 18669ae82921SPaul Mullowney } 18672c71b3e2SJacob Faibussowitsch PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1868abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1869abb89eb1SStefano Zampini else nnz = a->nz; 1870cbc6b225SStefano Zampini PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 18719ae82921SPaul Mullowney 187285ba7357SStefano Zampini /* create cusparse matrix */ 1873abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1874aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 18755f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 18765f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 18775f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 18789ae82921SPaul 
Mullowney 18795f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 18805f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 18815f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 18825f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18835f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18845f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18855f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 1886b06137fdSPaul Mullowney 1887aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1888aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1889aa372e3fSPaul Mullowney /* set the matrix */ 1890afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1891afb2bd1cSJunchao Zhang mat->num_rows = m; 1892afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1893abb89eb1SStefano Zampini mat->num_entries = nnz; 1894afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1895afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18969ae82921SPaul Mullowney 1897abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1898abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1899aa372e3fSPaul Mullowney 1900abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1901abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1902aa372e3fSPaul Mullowney 1903aa372e3fSPaul Mullowney /* assign the pointer */ 1904afb2bd1cSJunchao Zhang matstruct->mat = mat; 
1905afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1906afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1907afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1908afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1909afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1910afb2bd1cSJunchao Zhang mat->values->data().get(), 1911afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1912afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1913afb2bd1cSJunchao Zhang } 1914afb2bd1cSJunchao Zhang #endif 1915aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1916afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1917afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1918afb2bd1cSJunchao Zhang #else 1919afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1920afb2bd1cSJunchao Zhang mat->num_rows = m; 1921afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1922abb89eb1SStefano Zampini mat->num_entries = nnz; 1923afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1924afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1925aa372e3fSPaul Mullowney 1926abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1927abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1928aa372e3fSPaul Mullowney 1929abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1930abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1931aa372e3fSPaul Mullowney 1932aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 19335f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat)); 1934aa372e3fSPaul Mullowney 
cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1935aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1936afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1937afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1938afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1939afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 194057d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1941aa372e3fSPaul Mullowney /* assign the pointer */ 1942aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1943aa372e3fSPaul Mullowney 1944afb2bd1cSJunchao Zhang if (mat) { 1945afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1946afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1947afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1948afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1949087f3262SPaul Mullowney } 1950afb2bd1cSJunchao Zhang #endif 1951087f3262SPaul Mullowney } 1952ca45077fSPaul Mullowney 1953aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1954213423ffSJunchao Zhang if (a->compressedrow.use) { 1955213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1956aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1957aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1958213423ffSJunchao Zhang tmp = m; 1959213423ffSJunchao Zhang } else { 1960213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1961213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1962213423ffSJunchao Zhang tmp = 0; 1963213423ffSJunchao Zhang } 19645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 1965aa372e3fSPaul Mullowney 1966aa372e3fSPaul Mullowney /* assign the pointer */ 1967aa372e3fSPaul 
cusparsestruct->mat = matstruct;
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
    cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/*
  VecCUDAPlusEquals - functor applied to a 2-element tuple t.

  Accumulates element 0 into element 1:  get<1>(t) = get<1>(t) + get<0>(t).
  Element 0 is only read; element 1 is read and then overwritten.
*/
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/*
  VecCUDAEquals - functor applied to a 2-element tuple t.

  Copies element 0 into element 1:  get<1>(t) = get<0>(t).
*/
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/*
  VecCUDAEqualsReverse - functor applied to a 2-element tuple t.

  Copies in the opposite direction of VecCUDAEquals:  get<0>(t) = get<1>(t).
*/
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/*
  MatMatCusparse - product data attached to C->product->data by the MatProductSymbolic
  routines below and released by MatDestroy_MatMatCusparse().

  The structure is obtained with PetscNew() in the dense symbolic routine, so every
  field starts out zero/NULL.
*/
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was of type MATSEQDENSE when the symbolic phase ran; the numeric phase converts it back */
  PetscScalar           *Bt;       /* device buffer for B^T; only allocated for CUDA < 11 with ABt/RARt products */
  Mat                   X;         /* intermediate dense matrix used by the RARt and PtAP products */
  PetscBool             reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;     /* value passed to PetscLogGpuFlops() by the sparse-sparse numeric phase */
  CsrMatrix             *Bcsr;     /* when non-NULL, used instead of B's own CSR data (B may be in compressed row storage) */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor used together with Bcsr */
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda;   /* Record leading dimensions of B and C here to detect changes */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2;  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/*
  MatDestroy_MatMatCusparse - releases everything owned by a MatMatCusparse.

  Input:
    data - pointer to the MatMatCusparse (installed as C->product->data)

  Frees the device buffers, destroys the cuSPARSE descriptors that were created,
  destroys the intermediate matrix X and finally frees the structure itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  CHKERRCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors are only destroyed when they were actually created */
  if (mmdata->matSpBDescr) {
    CHKERRCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  }
  if (mmdata->matBDescr) {
    CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  }
  if (mmdata->matCDescr) {
    CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  }
  if (mmdata->spgemmDesc) {
    CHKERRCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) {
    CHKERRCUDA(cudaFree(mmdata->dBuffer4));
  }
  if (mmdata->dBuffer5) {
    CHKERRCUDA(cudaFree(mmdata->dBuffer5));
  }
#endif
  if (mmdata->mmBuffer) {
    CHKERRCUDA(cudaFree(mmdata->mmBuffer));
  }
  if (mmdata->mmBuffer2) {
    CHKERRCUDA(cudaFree(mmdata->mmBuffer2));
  }
#endif
  CHKERRQ(MatDestroy(&mmdata->X));
  CHKERRQ(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a product of a
  SEQAIJCUSPARSE matrix A with a dense matrix B.

  Input/Output:
    C - the product matrix; C->product holds A, B, the product type and the
        MatMatCusparse created by the symbolic phase

  Supported product types: AB, AtB, ABt, PtAP, RARt. For PtAP and RARt the
  sparse-times-dense result is written to the intermediate mmdata->X and C is then
  obtained from MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,X,C,...).

  Errors are raised when the product data is missing, A is not SEQAIJCUSPARSE,
  A has been bound to the CPU, the product type is unsupported, or the
  multiplication structure of A is missing.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A      = product->A;
  B      = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  /* select the multiplication structure of A, the operation applied to it and the
     m x n size of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;

  /* if the user passed a CPU matrix, copy the data to the GPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) {
    CHKERRQ(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  }
  CHKERRQ(MatDenseCUDAGetArrayRead(B,&barray));
  CHKERRQ(MatDenseGetLDA(B,&blda));

  /* RARt and PtAP write the sparse-times-dense result to the intermediate X, everything else directly to C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    CHKERRQ(MatDenseCUDAGetArrayWrite(C,&carray));
    CHKERRQ(MatDenseGetLDA(C,&clda));
  }

  CHKERRQ(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;

    /* descriptor of B: rebuilt when its LDA changed */
    if (mmdata->initialized && mmdata->Blda != blda) {
      CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    } else {
      /* descriptor kept from a previous call (only the LDA of C changed): it still
         holds the device pointer recorded back then, so refresh it exactly as the
         steady-state branch below does */
      CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    }

    /* descriptor of the result: rebuilt when its LDA changed */
    if (mmdata->initialized && mmdata->Clda != clda) {
      CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    } else {
      /* same reasoning as for B: only the LDA of B changed, keep the descriptor but update its pointer */
      CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* allocate the work buffer the first time, and grow it when the existing one is too small */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      CHKERRCUDA(cudaFree(mmdata->mmBuffer));
      CHKERRCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    CHKERRCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated by the symbolic phase) */
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  CHKERRQ(MatDenseCUDARestoreArrayRead(B,&barray));

  /* finish the triple products with a dense-dense multiplication involving B and X */
  if (product->type == MATPRODUCT_RARt) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(C,&carray));
  }

  /* give back CPU dense matrices to callers that supplied them */
  if (mmdata->cisdense) {
    CHKERRQ(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    CHKERRQ(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a product of a
  SEQAIJCUSPARSE matrix A with a dense matrix B.

  Input/Output:
    C - the product matrix; on return it has its sizes set, is of type
        MATSEQDENSECUDA, owns a new MatMatCusparse in C->product->data and uses
        MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as numeric routine

  Errors are raised when product data already exists, A is not SEQAIJCUSPARSE,
  A is not stored in MAT_CUSPARSE_CSR format, or the product type is unsupported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes of C for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  CHKERRQ(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  CHKERRQ(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    CHKERRCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    CHKERRQ(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a product of two
  SEQAIJCUSPARSE matrices (AB, AtB, ABt) producing a SEQAIJCUSPARSE matrix C.

  Input/Output:
    C - the product matrix; C->product holds A, B, the product type and the
        MatMatCusparse with the state needed by the cuSPARSE SpGEMM calls

  The computation is skipped (only the assembly bookkeeping at the end is done) when
  mmdata->reusesym is set or when C has no nonzeros.

  Errors are raised when the product data is missing, a matrix has the wrong type,
  A or B has been bound to the CPU, a matrix is not in MAT_CUSPARSE_CSR format, the
  product type is unsupported, or a required multiplication/CSR structure is missing.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute for an empty result */
  if (!c->nz) goto finalize;

  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a transposed symmetric operand is handled as a plain AB product, provided the symbolic phase did the same */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }

  /* transposed operands are taken from the matTranspose structures, since opA and opB are always NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  CHKERRQ(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuFlops(mmdata->flops));
  CHKERRCUDA(WaitForCUDA());
  CHKERRQ(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
MatMatCusparse *mmdata; 2446fcdce8c4SStefano Zampini PetscLogDouble flops; 2447fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2448fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2449fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2450fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2451fcdce8c4SStefano Zampini #else 2452fcdce8c4SStefano Zampini int cnz; 2453fcdce8c4SStefano Zampini #endif 2454b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2455fcdce8c4SStefano Zampini 2456fcdce8c4SStefano Zampini PetscFunctionBegin; 2457fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2458*28b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2459fcdce8c4SStefano Zampini A = product->A; 2460fcdce8c4SStefano Zampini B = product->B; 24615f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2462*28b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 24635f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 2464*28b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2465fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2466fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2467fcdce8c4SStefano Zampini /* product data */ 24685f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&mmdata)); 2469fcdce8c4SStefano Zampini C->product->data = mmdata; 2470fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2471fcdce8c4SStefano Zampini 24725f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 24735f80ce2aSJacob Faibussowitsch 
CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B)); 2474d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2475d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 24762c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 24772c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2478d60bce21SJunchao Zhang 2479fcdce8c4SStefano Zampini ptype = product->type; 2480fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2481fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2482fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2483fa046f9fSJunchao Zhang } 2484fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2485fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2486fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2487fa046f9fSJunchao Zhang } 2488fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2489fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2490fcdce8c4SStefano Zampini switch (ptype) { 2491fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2492fcdce8c4SStefano Zampini m = A->rmap->n; 2493fcdce8c4SStefano Zampini n = B->cmap->n; 2494fcdce8c4SStefano Zampini k = A->cmap->n; 2495fcdce8c4SStefano Zampini Amat = Acusp->mat; 2496fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2497fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2498fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2499fcdce8c4SStefano Zampini break; 2500fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2501fcdce8c4SStefano Zampini m = A->cmap->n; 2502fcdce8c4SStefano Zampini n = B->cmap->n; 2503fcdce8c4SStefano Zampini k = A->rmap->n; 25045f80ce2aSJacob Faibussowitsch 
CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2505fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2506fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2507fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2508fcdce8c4SStefano Zampini break; 2509fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2510fcdce8c4SStefano Zampini m = A->rmap->n; 2511fcdce8c4SStefano Zampini n = B->rmap->n; 2512fcdce8c4SStefano Zampini k = A->cmap->n; 25135f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2514fcdce8c4SStefano Zampini Amat = Acusp->mat; 2515fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2516fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2517fcdce8c4SStefano Zampini break; 2518fcdce8c4SStefano Zampini default: 251998921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2520fcdce8c4SStefano Zampini } 2521fcdce8c4SStefano Zampini 2522fcdce8c4SStefano Zampini /* create cusparse matrix */ 25235f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(C,m,n,m,n)); 25245f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(C,MATSEQAIJCUSPARSE)); 2525fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2526fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2527fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2528fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2529fcdce8c4SStefano Zampini 2530fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2531fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2532fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 25335f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 25345f80ce2aSJacob Faibussowitsch 
CHKERRQ(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2535fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2536fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2537fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2538fcdce8c4SStefano Zampini } else { 2539fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2540fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2541fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2542fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2543fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2544fcdce8c4SStefano Zampini } 2545fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2546fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2547fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2548fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2549fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2550fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 25515f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 25525f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 25535f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 25545f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 25555f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 25565f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 25575f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25585f80ce2aSJacob Faibussowitsch 
CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25595f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2560fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2561fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2562fcdce8c4SStefano Zampini c->nz = 0; 2563fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2564fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2565fcdce8c4SStefano Zampini goto finalizesym; 2566fcdce8c4SStefano Zampini } 2567fcdce8c4SStefano Zampini 2568*28b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2569*28b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2570fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2571fcdce8c4SStefano Zampini if (!biscompressed) { 2572fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2573fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2574fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2575fcdce8c4SStefano Zampini #endif 2576fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2577fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2578fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2579fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2580fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2581fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2582fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2583fcdce8c4SStefano Zampini 
Bcsr->values = cBcsr->values; 2584fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2585fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2586fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 25875f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2588fcdce8c4SStefano Zampini } 2589fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2590fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2591fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2592fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2593fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2594fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2595fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2596fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2597fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2598fcdce8c4SStefano Zampini } 2599fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2600fcdce8c4SStefano Zampini #endif 2601fcdce8c4SStefano Zampini } 2602*28b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2603*28b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2604fcdce8c4SStefano Zampini /* precompute flops count */ 2605fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2606fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2607fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2608fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2609fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2610fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2611fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 
2612fcdce8c4SStefano Zampini } 2613fcdce8c4SStefano Zampini } 2614fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2615fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2616fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2617fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2618fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 2619fcdce8c4SStefano Zampini } 2620fcdce8c4SStefano Zampini } else { /* TODO */ 2621fcdce8c4SStefano Zampini flops = 0.; 2622fcdce8c4SStefano Zampini } 2623fcdce8c4SStefano Zampini 2624fcdce8c4SStefano Zampini mmdata->flops = flops; 26255f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 2626b4285af6SJunchao Zhang 2627fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 26285f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2629fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2630fcdce8c4SStefano Zampini NULL, NULL, NULL, 2631fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2632fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 26335f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2634b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2635b4285af6SJunchao Zhang { 2636b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2637b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2638b4285af6SJunchao Zhang */ 2639b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2640b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2641b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2642b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2643b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2644b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2645b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2646b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2647b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2648b4285af6SJunchao Zhang 2649b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2650b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2651b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2652b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2653b4285af6SJunchao Zhang &bufferSize1, NULL);CHKERRCUSPARSE(stat); 26545f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2655b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2656b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2657b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2658b4285af6SJunchao Zhang &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2659b4285af6SJunchao Zhang 2660b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2661b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2662b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, 2663b4285af6SJunchao Zhang &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 26645f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 26655f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 26665f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2667b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2668b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2669b4285af6SJunchao Zhang &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 26705f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(dBuffer1)); 26715f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(dBuffer2)); 2672b4285af6SJunchao Zhang 2673b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2674b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 26755f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2676b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2677b4285af6SJunchao Zhang /* allocate matrix C */ 2678b4285af6SJunchao Zhang Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2679b4285af6SJunchao Zhang Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2680b4285af6SJunchao Zhang /* update matC with the new pointers */ 2681b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2682b4285af6SJunchao Zhang Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2683b4285af6SJunchao Zhang 2684b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 
2685b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2686b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2687b4285af6SJunchao Zhang &bufferSize5, NULL);CHKERRCUSPARSE(stat); 26885f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2689b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2690b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2691b4285af6SJunchao Zhang &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 26925f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(dBuffer3)); 2693b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2694b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2695b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2696b4285af6SJunchao Zhang mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 26975f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 2698b4285af6SJunchao Zhang } 2699ae37ee31SJunchao Zhang #else 2700b4285af6SJunchao Zhang size_t bufSize2; 2701fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2702b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2703fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2704fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2705fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 27065f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, 
bufSize2)); 2707fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2708b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2709fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2710fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2711fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2712fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2713b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2714fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2715fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2716fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2717fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2718fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2719fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2720fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2721fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 27225f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2723fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2724b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2725fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2726fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2727fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2728fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 27295f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2730fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 27315f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2732fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2733fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2734fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2735fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2736fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2737fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2738b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2739fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2740fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2741ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2742fcdce8c4SStefano Zampini #else 27435f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2744b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2745fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2746fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2747fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2748fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2749fcdce8c4SStefano Zampini c->nz = cnz; 2750fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2751fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2752fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2753fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2754fcdce8c4SStefano Zampini 27555f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2756fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2757fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2758fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2759b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2760fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2761fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2762fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2763fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2764fcdce8c4SStefano Zampini #endif 27655f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(mmdata->flops)); 27665f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 2767fcdce8c4SStefano Zampini finalizesym: 2768fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2769fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2770fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 27715f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m+1,&c->i)); 27725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(c->nz,&c->j)); 2773fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2774fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2775fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2776fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2777fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2778fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2779fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27805f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27815f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 
2782fcdce8c4SStefano Zampini } else { 2783fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2784fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27855f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27865f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2787fcdce8c4SStefano Zampini } 2788fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2789fcdce8c4SStefano Zampini PetscInt r = 0; 2790fcdce8c4SStefano Zampini c->i[0] = 0; 2791fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2792fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2793fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2794fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2795fcdce8c4SStefano Zampini } 2796fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2797fcdce8c4SStefano Zampini } 27985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 27995f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&c->ilen)); 28005f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&c->imax)); 2801fcdce8c4SStefano Zampini c->maxnz = c->nz; 2802fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2803fcdce8c4SStefano Zampini c->rmax = 0; 2804fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2805fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2806fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2807fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 2808fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2809fcdce8c4SStefano Zampini } 28105f80ce2aSJacob Faibussowitsch CHKERRQ(MatMarkDiagonal_SeqAIJ(C)); 
28115f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(c->nz,&c->a)); 2812fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2813fcdce8c4SStefano Zampini 2814fcdce8c4SStefano Zampini C->nonzerostate++; 28155f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp(C->rmap)); 28165f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp(C->cmap)); 2817fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2818fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2819fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2820fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2821fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2822abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2823fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2824fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2825fcdce8c4SStefano Zampini } 2826fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2827fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2828fcdce8c4SStefano Zampini } 2829fcdce8c4SStefano Zampini 2830fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2831fcdce8c4SStefano Zampini 2832fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2833fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2834fcdce8c4SStefano Zampini { 2835fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2836fcdce8c4SStefano Zampini PetscErrorCode ierr; 2837fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2838fcdce8c4SStefano Zampini 2839fcdce8c4SStefano Zampini PetscFunctionBegin; 2840fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 28415f80ce2aSJacob Faibussowitsch 
CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 2842abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 28435f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 2844fcdce8c4SStefano Zampini } 2845fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2846fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2847fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 28485f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 2849fcdce8c4SStefano Zampini } 2850fcdce8c4SStefano Zampini } 285165e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 285265e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 285365e4b4d4SStefano Zampini switch (product->type) { 285465e4b4d4SStefano Zampini case MATPRODUCT_AB: 285565e4b4d4SStefano Zampini if (product->api_user) { 285665e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 28575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 285865e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 285965e4b4d4SStefano Zampini } else { 286065e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 28615f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 286265e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 286365e4b4d4SStefano Zampini } 286465e4b4d4SStefano Zampini break; 286565e4b4d4SStefano Zampini case MATPRODUCT_AtB: 286665e4b4d4SStefano Zampini if (product->api_user) { 286765e4b4d4SStefano Zampini ierr = 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 28685f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 286965e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 287065e4b4d4SStefano Zampini } else { 287165e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 28725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 287365e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 287465e4b4d4SStefano Zampini } 287565e4b4d4SStefano Zampini break; 287665e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 287765e4b4d4SStefano Zampini if (product->api_user) { 287865e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 28795f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 288065e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 288165e4b4d4SStefano Zampini } else { 288265e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 28835f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 288465e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 288565e4b4d4SStefano Zampini } 288665e4b4d4SStefano Zampini break; 288765e4b4d4SStefano Zampini case MATPRODUCT_RARt: 288865e4b4d4SStefano Zampini if (product->api_user) { 288965e4b4d4SStefano Zampini ierr = 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 28905f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 289165e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 289265e4b4d4SStefano Zampini } else { 289365e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 28945f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 289565e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 289665e4b4d4SStefano Zampini } 289765e4b4d4SStefano Zampini break; 289865e4b4d4SStefano Zampini case MATPRODUCT_ABC: 289965e4b4d4SStefano Zampini if (product->api_user) { 290065e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 29015f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 290265e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 290365e4b4d4SStefano Zampini } else { 290465e4b4d4SStefano Zampini ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 29055f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 290665e4b4d4SStefano Zampini ierr = PetscOptionsEnd();CHKERRQ(ierr); 290765e4b4d4SStefano Zampini } 290865e4b4d4SStefano Zampini break; 290965e4b4d4SStefano Zampini default: 291065e4b4d4SStefano Zampini break; 291165e4b4d4SStefano Zampini } 291265e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 291365e4b4d4SStefano Zampini } 291465e4b4d4SStefano Zampini /* dispatch */ 
2915fcdce8c4SStefano Zampini if (isdense) { 2916ccdfe979SStefano Zampini switch (product->type) { 2917ccdfe979SStefano Zampini case MATPRODUCT_AB: 2918ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2919ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2920ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2921ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2922fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 29235f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2924fcdce8c4SStefano Zampini } else { 2925fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2926fcdce8c4SStefano Zampini } 2927fcdce8c4SStefano Zampini break; 2928fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2929fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2930fcdce8c4SStefano Zampini break; 2931ccdfe979SStefano Zampini default: 2932ccdfe979SStefano Zampini break; 2933ccdfe979SStefano Zampini } 2934fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2935fcdce8c4SStefano Zampini switch (product->type) { 2936fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2937fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2938fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2939fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2940fcdce8c4SStefano Zampini break; 2941fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2942fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2943fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2944fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2945fcdce8c4SStefano Zampini break; 2946fcdce8c4SStefano Zampini default: 2947fcdce8c4SStefano Zampini break; 2948fcdce8c4SStefano Zampini } 2949fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 29505f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat)); 2951fcdce8c4SStefano Zampini } 2952ccdfe979SStefano Zampini PetscFunctionReturn(0); 
2953ccdfe979SStefano Zampini } 2954ccdfe979SStefano Zampini 29556fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 29569ae82921SPaul Mullowney { 29579ae82921SPaul Mullowney PetscFunctionBegin; 29585f80ce2aSJacob Faibussowitsch CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 2959e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2960e6e9a74fSStefano Zampini } 2961e6e9a74fSStefano Zampini 2962e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2963e6e9a74fSStefano Zampini { 2964e6e9a74fSStefano Zampini PetscFunctionBegin; 29655f80ce2aSJacob Faibussowitsch CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 2966e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2967e6e9a74fSStefano Zampini } 2968e6e9a74fSStefano Zampini 2969e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2970e6e9a74fSStefano Zampini { 2971e6e9a74fSStefano Zampini PetscFunctionBegin; 29725f80ce2aSJacob Faibussowitsch CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 2973e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2974e6e9a74fSStefano Zampini } 2975e6e9a74fSStefano Zampini 2976e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2977e6e9a74fSStefano Zampini { 2978e6e9a74fSStefano Zampini PetscFunctionBegin; 29795f80ce2aSJacob Faibussowitsch CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 29809ae82921SPaul Mullowney PetscFunctionReturn(0); 29819ae82921SPaul Mullowney } 29829ae82921SPaul Mullowney 29836fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2984ca45077fSPaul Mullowney { 2985ca45077fSPaul Mullowney PetscFunctionBegin; 29865f80ce2aSJacob Faibussowitsch CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 
2987ca45077fSPaul Mullowney PetscFunctionReturn(0); 2988ca45077fSPaul Mullowney } 2989ca45077fSPaul Mullowney 2990a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2991a0e72f99SJunchao Zhang { 2992a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2993a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2994a0e72f99SJunchao Zhang } 2995a0e72f99SJunchao Zhang 2996afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2997e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 29989ae82921SPaul Mullowney { 29999ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3000aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 30019ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3002e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3003e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3004e6e9a74fSStefano Zampini PetscBool compressed; 3005afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3006afb2bd1cSJunchao Zhang PetscInt nx,ny; 3007afb2bd1cSJunchao Zhang #endif 30086e111a19SKarl Rupp 30099ae82921SPaul Mullowney PetscFunctionBegin; 30102c71b3e2SJacob Faibussowitsch PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3011cbc6b225SStefano Zampini if (!a->nz) { 30125f80ce2aSJacob Faibussowitsch if (!yy) CHKERRQ(VecSet_SeqCUDA(zz,0)); 30135f80ce2aSJacob Faibussowitsch else CHKERRQ(VecCopy_SeqCUDA(yy,zz)); 3014e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3015e6e9a74fSStefano Zampini } 301634d6c7a5SJose E. 
Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 30175f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 3018e6e9a74fSStefano Zampini if (!trans) { 30199ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 30205f80ce2aSJacob Faibussowitsch PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3021e6e9a74fSStefano Zampini } else { 30221a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3023e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3024e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3025e6e9a74fSStefano Zampini } else { 30265f80ce2aSJacob Faibussowitsch if (!cusparsestruct->matTranspose) CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3027e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3028e6e9a74fSStefano Zampini } 3029e6e9a74fSStefano Zampini } 3030e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3031e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3032213423ffSJunchao Zhang 3033e6e9a74fSStefano Zampini try { 30345f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 30355f80ce2aSJacob Faibussowitsch if (yy == zz) CHKERRQ(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 30365f80ce2aSJacob Faibussowitsch else CHKERRQ(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3037afb2bd1cSJunchao Zhang 30385f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 3039e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3040afb2bd1cSJunchao Zhang /* z = A x + beta y. 
3041afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3042afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3043afb2bd1cSJunchao Zhang */ 3044e6e9a74fSStefano Zampini xptr = xarray; 3045afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3046213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3047afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3048afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3049afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3050afb2bd1cSJunchao Zhang */ 3051afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3052afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3053afb2bd1cSJunchao Zhang nx = mat->num_cols; 3054afb2bd1cSJunchao Zhang ny = mat->num_rows; 3055afb2bd1cSJunchao Zhang } 3056afb2bd1cSJunchao Zhang #endif 3057e6e9a74fSStefano Zampini } else { 3058afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3059afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3060afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3061afb2bd1cSJunchao Zhang */ 3062afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3063e6e9a74fSStefano Zampini dptr = zarray; 3064e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3065afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3066e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3067a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3068e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3069e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3070e6e9a74fSStefano Zampini } 3071afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3072afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3073afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3074afb2bd1cSJunchao Zhang nx = mat->num_rows; 3075afb2bd1cSJunchao Zhang ny = mat->num_cols; 3076afb2bd1cSJunchao Zhang } 3077afb2bd1cSJunchao Zhang #endif 3078e6e9a74fSStefano Zampini } 30799ae82921SPaul Mullowney 3080afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3081aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3082afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 30835f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3084afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 30855f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 30865f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 30875f80ce2aSJacob 
Faibussowitsch CHKERRCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3088afb2bd1cSJunchao Zhang matstruct->matDescr, 3089afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3090afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3091afb2bd1cSJunchao Zhang cusparse_scalartype, 3092afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 30935f80ce2aSJacob Faibussowitsch &matstruct->cuSpMV[opA].spmvBufferSize)); 30945f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3095afb2bd1cSJunchao Zhang 3096afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3097afb2bd1cSJunchao Zhang } else { 3098afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 30995f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 31005f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3101afb2bd1cSJunchao Zhang } 3102afb2bd1cSJunchao Zhang 31035f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3104afb2bd1cSJunchao Zhang matstruct->alpha_one, 31053606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3106afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3107afb2bd1cSJunchao Zhang beta, 3108afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3109afb2bd1cSJunchao Zhang cusparse_scalartype, 3110afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 31115f80ce2aSJacob Faibussowitsch matstruct->cuSpMV[opA].spmvBuffer)); 3112afb2bd1cSJunchao Zhang #else 31137656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 31145f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 3115a65300a6SPaul Mullowney mat->num_rows, 
mat->num_cols, 3116afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3117aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3118e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 31195f80ce2aSJacob Faibussowitsch dptr)); 3120afb2bd1cSJunchao Zhang #endif 3121aa372e3fSPaul Mullowney } else { 3122213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3123afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3124afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3125afb2bd1cSJunchao Zhang #else 3126301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 31275f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3128afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3129e6e9a74fSStefano Zampini xptr, beta, 31305f80ce2aSJacob Faibussowitsch dptr)); 3131afb2bd1cSJunchao Zhang #endif 3132a65300a6SPaul Mullowney } 3133aa372e3fSPaul Mullowney } 31345f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3135aa372e3fSPaul Mullowney 3136e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3137213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3138213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 31395f80ce2aSJacob Faibussowitsch CHKERRQ(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3140e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 31415f80ce2aSJacob Faibussowitsch CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 31427656d835SStefano Zampini } 3143213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 31445f80ce2aSJacob Faibussowitsch CHKERRQ(VecSet_SeqCUDA(zz,0)); 31457656d835SStefano Zampini } 31467656d835SStefano Zampini 3147213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3148213423ffSJunchao Zhang if (compressed) { 31495f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 3150a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3151a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3152a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3153a0e72f99SJunchao Zhang */ 3154a0e72f99SJunchao Zhang #if 0 3155a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3156a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3157a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3158e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3159c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3160a0e72f99SJunchao Zhang #else 3161a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3162a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3163a0e72f99SJunchao Zhang #endif 31645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3165e6e9a74fSStefano Zampini } 3166e6e9a74fSStefano Zampini } else { 
3167e6e9a74fSStefano Zampini if (yy && yy != zz) { 31685f80ce2aSJacob Faibussowitsch CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3169e6e9a74fSStefano Zampini } 3170e6e9a74fSStefano Zampini } 31715f80ce2aSJacob Faibussowitsch CHKERRQ(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 31725f80ce2aSJacob Faibussowitsch if (yy == zz) CHKERRQ(VecCUDARestoreArray(zz,&zarray)); 31735f80ce2aSJacob Faibussowitsch else CHKERRQ(VecCUDARestoreArrayWrite(zz,&zarray)); 31749ae82921SPaul Mullowney } catch(char *ex) { 317598921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 31769ae82921SPaul Mullowney } 3177e6e9a74fSStefano Zampini if (yy) { 31785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*a->nz)); 3179e6e9a74fSStefano Zampini } else { 31805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3181e6e9a74fSStefano Zampini } 31829ae82921SPaul Mullowney PetscFunctionReturn(0); 31839ae82921SPaul Mullowney } 31849ae82921SPaul Mullowney 31856fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3186ca45077fSPaul Mullowney { 3187ca45077fSPaul Mullowney PetscFunctionBegin; 31885f80ce2aSJacob Faibussowitsch CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3189ca45077fSPaul Mullowney PetscFunctionReturn(0); 3190ca45077fSPaul Mullowney } 3191ca45077fSPaul Mullowney 31926fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 31939ae82921SPaul Mullowney { 3194042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3195042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 31963fa6b06aSMark Adams 3197042217e8SBarry Smith PetscFunctionBegin; 31985f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode)); 3199042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3200042217e8SBarry Smith 32015f80ce2aSJacob Faibussowitsch 
CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 32025f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusp->deviceMat)); 3203042217e8SBarry Smith cusp->deviceMat = NULL; 3204042217e8SBarry Smith } 32059ae82921SPaul Mullowney PetscFunctionReturn(0); 32069ae82921SPaul Mullowney } 32079ae82921SPaul Mullowney 32089ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3209e057df02SPaul Mullowney /*@ 32109ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3211e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3212e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3213e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3214e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3215e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 32169ae82921SPaul Mullowney 3217d083f849SBarry Smith Collective 32189ae82921SPaul Mullowney 32199ae82921SPaul Mullowney Input Parameters: 32209ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 32219ae82921SPaul Mullowney . m - number of rows 32229ae82921SPaul Mullowney . n - number of columns 32239ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 32249ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 32250298fd71SBarry Smith (possibly different for each row) or NULL 32269ae82921SPaul Mullowney 32279ae82921SPaul Mullowney Output Parameter: 32289ae82921SPaul Mullowney . 
A - the matrix 32299ae82921SPaul Mullowney 32309ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 32319ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 32329ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 32339ae82921SPaul Mullowney 32349ae82921SPaul Mullowney Notes: 32359ae82921SPaul Mullowney If nnz is given then nz is ignored 32369ae82921SPaul Mullowney 32379ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 32389ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 32399ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 32409ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 32419ae82921SPaul Mullowney 32429ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 32430298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 32449ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 32459ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 32469ae82921SPaul Mullowney 32479ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 32489ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 32499ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 32509ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 
32519ae82921SPaul Mullowney 32529ae82921SPaul Mullowney Level: intermediate 32539ae82921SPaul Mullowney 3254e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 32559ae82921SPaul Mullowney @*/ 32569ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 32579ae82921SPaul Mullowney { 32589ae82921SPaul Mullowney PetscFunctionBegin; 32595f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(comm,A)); 32605f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*A,m,n,m,n)); 32615f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE)); 32625f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 32639ae82921SPaul Mullowney PetscFunctionReturn(0); 32649ae82921SPaul Mullowney } 32659ae82921SPaul Mullowney 32666fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 32679ae82921SPaul Mullowney { 32689ae82921SPaul Mullowney PetscFunctionBegin; 32699ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 32705f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 32719ae82921SPaul Mullowney } else { 32725f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 3273aa372e3fSPaul Mullowney } 32745f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 32755f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 32765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 32775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 
32785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 32795f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 32805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 32815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 32825f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 32835f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 32845f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy_SeqAIJ(A)); 32859ae82921SPaul Mullowney PetscFunctionReturn(0); 32869ae82921SPaul Mullowney } 32879ae82921SPaul Mullowney 3288ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 328995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 32909ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 32919ff858a8SKarl Rupp { 32929ff858a8SKarl Rupp PetscFunctionBegin; 32935f80ce2aSJacob Faibussowitsch CHKERRQ(MatDuplicate_SeqAIJ(A,cpvalues,B)); 32945f80ce2aSJacob Faibussowitsch CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 32959ff858a8SKarl Rupp PetscFunctionReturn(0); 32969ff858a8SKarl Rupp } 32979ff858a8SKarl Rupp 3298039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 329995639643SRichard Tran Mills { 3300a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3301039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3302039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 
3303039c6fbaSStefano Zampini PetscScalar *ay; 3304039c6fbaSStefano Zampini const PetscScalar *ax; 3305039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3306e6e9a74fSStefano Zampini 330795639643SRichard Tran Mills PetscFunctionBegin; 3308a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3309a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3310039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 33115f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33125f80ce2aSJacob Faibussowitsch CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str)); 3313a587d139SMark PetscFunctionReturn(0); 331495639643SRichard Tran Mills } 3315039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 33165f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(Y)); 33175f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(X)); 33185f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 33195f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3320039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3321039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3322039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3323039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3324039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3325039c6fbaSStefano Zampini if (eq) { 3326039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3327039c6fbaSStefano Zampini } 3328039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 
3329039c6fbaSStefano Zampini } 3330d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3331d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3332039c6fbaSStefano Zampini 3333039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3334039c6fbaSStefano Zampini PetscScalar b = 1.0; 3335039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3336039c6fbaSStefano Zampini size_t bufferSize; 3337039c6fbaSStefano Zampini void *buffer; 3338039c6fbaSStefano Zampini #endif 3339039c6fbaSStefano Zampini 33405f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33415f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33425f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3343039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33445f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3345039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3346039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33475f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 33485f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc(&buffer,bufferSize)); 33495f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 33505f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3351039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3352039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33535f80ce2aSJacob Faibussowitsch cy->mat->descr, 
ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 33545f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(x->nz + y->nz)); 33555f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 33565f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(buffer)); 3357039c6fbaSStefano Zampini #else 33585f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 33595f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3360039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3361039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33625f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 33635f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(x->nz + y->nz)); 33645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3365039c6fbaSStefano Zampini #endif 33665f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 33675f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33685f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33695f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJInvalidateDiagonal(Y)); 3370039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3371a587d139SMark cublasHandle_t cublasv2handle; 3372a587d139SMark PetscBLASInt one = 1, bnz = 1; 3373039c6fbaSStefano Zampini 33745f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33755f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33765f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle)); 33775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscBLASIntCast(x->nz,&bnz)); 33785f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 33795f80ce2aSJacob 
Faibussowitsch CHKERRCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 33805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*bnz)); 33815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 33825f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33835f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33845f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJInvalidateDiagonal(Y)); 3385039c6fbaSStefano Zampini } else { 33865f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33875f80ce2aSJacob Faibussowitsch CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str)); 3388a587d139SMark } 338995639643SRichard Tran Mills PetscFunctionReturn(0); 339095639643SRichard Tran Mills } 339195639643SRichard Tran Mills 339233c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 339333c9ba73SStefano Zampini { 339433c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 339533c9ba73SStefano Zampini PetscScalar *ay; 339633c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 339733c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 339833c9ba73SStefano Zampini 339933c9ba73SStefano Zampini PetscFunctionBegin; 34005f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 34015f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle)); 34025f80ce2aSJacob Faibussowitsch CHKERRQ(PetscBLASIntCast(y->nz,&bnz)); 34035f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 34045f80ce2aSJacob Faibussowitsch CHKERRCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 34055f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(bnz)); 34065f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 34075f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 34085f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJInvalidateDiagonal(Y)); 340933c9ba73SStefano Zampini PetscFunctionReturn(0); 341033c9ba73SStefano Zampini } 

/*
  Zeroes the stored values on the host and, when they exist, in the device CSR value
  arrays of the matrix and of its cached transpose. The offload mask becomes
  PETSC_OFFLOAD_BOTH only if the device values of the matrix itself were zeroed.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  CHKERRQ(PetscArrayzero(a->a,a->i[A->rmap->n]));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/*
  Switches the operation table of A between the host (SeqAIJ) and the CUSPARSE
  implementations. flg = PETSC_TRUE first copies the data back from the GPU and installs
  the host routines; flg = PETSC_FALSE installs the CUSPARSE routines. Factored matrices
  only record the flag.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop every SeqAIJ-level array accessor override */
    CHKERRQ(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only used while bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/*
  Converts a SeqAIJ matrix to MATSEQAIJCUSPARSE. For MAT_INITIAL_MATRIX the input is
  duplicated, for MAT_REUSE_MATRIX it is copied into *newmat, otherwise *newmat is
  converted in place. A fresh CUSPARSE handle and spptr structure are created unless
  the matrix is reused or already carries one.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  CHKERRQ(PetscFree(B->defaultvectype));
  CHKERRQ(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
#endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the CUSPARSE operation table */
  CHKERRQ(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Type constructor: builds a SeqAIJ matrix and converts it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  CHKERRQ(MatCreate_SeqAIJ(B));
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Registers the CUSPARSE band LU solver for MATSEQAIJ and the CUSPARSE LU/Cholesky/ILU/ICC solvers for MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

/*
  Releases everything set up by a previous COO preallocation: the basic permutation
  arrays and, when the extended COO path was used, the device jmap/perm buffers.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    CHKERRCUDA(cudaFree(cusp->jmap_d));
    CHKERRCUDA(cudaFree(cusp->perm_d));
    /* MatSeqAIJCUSPARSE_Destroy() frees these whenever they are non-NULL; clear them so they are not freed twice */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Frees the mult structures, work arrays, COO data and CUSPARSE handle of an unfactored matrix, then the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) CHKERRCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) CHKERRCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) CHKERRCUDA(cudaFree((*cusparsestruct)->perm_d));
    CHKERRQ(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Deletes the three arrays of a CsrMatrix and the container, and zeroes the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Overload for one triangular factor: descriptor, analysis info, CSR data and work buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) CHKERRCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    CHKERRQ(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) CHKERRCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) CHKERRCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) CHKERRCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    CHKERRQ(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Overload for a mult structure: the stored matrix (CSR, or HYB/ELL before CUDA 11), descriptors, scalar constants and SpMV buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        CHKERRCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate the error code like every other call in this routine */
        CHKERRQ(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) CHKERRCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) CHKERRCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  CHKERRCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        CHKERRCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Frees the contents of the triangular-factor container but keeps the container and its CUSPARSE handle */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector   = NULL;
    /* the band buffers are cleared after freeing so that a repeated reset does not free them twice */
    if ((*trifactors)->a_band_d) {
      CHKERRCUDA(cudaFree((*trifactors)->a_band_d));
      (*trifactors)->a_band_d = NULL;
    }
    if ((*trifactors)->i_band_d) {
      CHKERRCUDA(cudaFree((*trifactors)->i_band_d));
      (*trifactors)->i_band_d = NULL;
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Resets the triangular-factor container, destroys its CUSPARSE handle and frees the container */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle;
    if (handle) {
      CHKERRCUSPARSE(cusparseDestroy(handle));
    }
    CHKERRQ(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* Lexicographic "less than" on (i,j) index pairs */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (i,j) index pairs */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when the two indices are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* Logical OR of two flags, returned as a PetscInt */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
Zampini if (!v) { 3811e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3812e61fc153SStefano Zampini goto finalize; 38137e8381f9SStefano Zampini } 3814e61fc153SStefano Zampini n = cusp->cooPerm->size(); 381508391a17SStefano Zampini if (isCudaMem(v)) { 381608391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 381708391a17SStefano Zampini } else { 3818e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3819e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 382008391a17SStefano Zampini d_v = cooPerm_v->data(); 38215f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 382208391a17SStefano Zampini } 38235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 3824e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3825ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3826bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 382708391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3828ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3829ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3830ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
3831ddea5d60SJunchao Zhang */ 3832e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3833e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3834e61fc153SStefano Zampini delete cooPerm_w; 38357e8381f9SStefano Zampini } else { 3836ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 383708391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38387e8381f9SStefano Zampini matrix->values->begin())); 383908391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 38407e8381f9SStefano Zampini matrix->values->end())); 3841ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 38427e8381f9SStefano Zampini } 38437e8381f9SStefano Zampini } else { 3844e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 384508391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3846e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 38477e8381f9SStefano Zampini } else { 384808391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38497e8381f9SStefano Zampini matrix->values->begin())); 385008391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 
38517e8381f9SStefano Zampini matrix->values->end())); 38527e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 38537e8381f9SStefano Zampini } 38547e8381f9SStefano Zampini } 38555f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3856e61fc153SStefano Zampini finalize: 3857e61fc153SStefano Zampini delete cooPerm_v; 38587e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 38595f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectStateIncrease((PetscObject)A)); 3860fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 38615f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 38625f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 38635f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 3864fcdce8c4SStefano Zampini a->reallocs = 0; 3865fcdce8c4SStefano Zampini A->info.mallocs += 0; 3866fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3867fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3868fcdce8c4SStefano Zampini A->num_ass++; 38697e8381f9SStefano Zampini PetscFunctionReturn(0); 38707e8381f9SStefano Zampini } 38717e8381f9SStefano Zampini 3872a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3873a49f1ed0SStefano Zampini { 3874a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3875a49f1ed0SStefano Zampini 3876a49f1ed0SStefano Zampini PetscFunctionBegin; 3877a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3878a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3879a49f1ed0SStefano Zampini if (destroy) { 38805f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 3881a49f1ed0SStefano Zampini delete 
cusp->csr2csc_i; 3882a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3883a49f1ed0SStefano Zampini } 38841a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3885a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3886a49f1ed0SStefano Zampini } 3887a49f1ed0SStefano Zampini 38887e8381f9SStefano Zampini #include <thrust/binary_search.h> 3889219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 3890219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 38917e8381f9SStefano Zampini { 38927e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 38937e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 38947e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 38957e8381f9SStefano Zampini 38967e8381f9SStefano Zampini PetscFunctionBegin; 38975f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp(A->rmap)); 38985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp(A->cmap)); 38997e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 39007e8381f9SStefano Zampini if (n != cooPerm_n) { 39017e8381f9SStefano Zampini delete cusp->cooPerm; 39027e8381f9SStefano Zampini delete cusp->cooPerm_a; 39037e8381f9SStefano Zampini cusp->cooPerm = NULL; 39047e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 39057e8381f9SStefano Zampini } 39067e8381f9SStefano Zampini if (n) { 39077e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 39087e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 39097e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 39107e8381f9SStefano Zampini 39117e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 39127e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 39137e8381f9SStefano Zampini 39145f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 39157e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 39167e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 3917ddea5d60SJunchao Zhang 3918ddea5d60SJunchao Zhang /* Ex. 
3919ddea5d60SJunchao Zhang n = 6 3920ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 3921ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 3922ddea5d60SJunchao Zhang */ 39237e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 39247e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 39257e8381f9SStefano Zampini 39265f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 39277e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3928ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3929ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 39307e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 39317e8381f9SStefano Zampini 3932ddea5d60SJunchao Zhang /* 3933ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 3934ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 3935ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 3936ddea5d60SJunchao Zhang */ 3937ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3938ddea5d60SJunchao Zhang 3939ddea5d60SJunchao Zhang /* 3940ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 3941ddea5d60SJunchao Zhang ^ekey 3942ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 3943ddea5d60SJunchao Zhang ^nekye 3944ddea5d60SJunchao Zhang */ 39457e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 39467e8381f9SStefano Zampini delete cusp->cooPerm_a; 39477e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 3948ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3949ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3950ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 3951ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3952ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 39537e8381f9SStefano Zampini w[0] = 0; 3954ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3955ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 39567e8381f9SStefano Zampini } 39577e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 3958ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3959ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3960ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 39615f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 39627e8381f9SStefano Zampini 39635f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 39647e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 39657e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 39667e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 39675f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(A->rmap->n+1,&a->i)); 3968ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 39695f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39707e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3971fcdce8c4SStefano Zampini a->rmax = 0; 39725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a->nz,&a->a)); 39735f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(a->nz,&a->j)); 39745f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39755f80ce2aSJacob Faibussowitsch if (!a->ilen) CHKERRQ(PetscMalloc1(A->rmap->n,&a->ilen)); 39765f80ce2aSJacob Faibussowitsch if (!a->imax) CHKERRQ(PetscMalloc1(A->rmap->n,&a->imax)); 39777e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 39787e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 39797e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 39807e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3981fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 39827e8381f9SStefano Zampini } 3983fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 39847e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 39855f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 39865f80ce2aSJacob Faibussowitsch CHKERRQ(MatMarkDiagonal_SeqAIJ(A)); 39877e8381f9SStefano Zampini } else { 39885f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(A,0,NULL)); 39897e8381f9SStefano Zampini 
} 39905f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 39917e8381f9SStefano Zampini 39927e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3993e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 39945f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArrayzero(a->a,a->nz)); 39955f80ce2aSJacob Faibussowitsch CHKERRQ(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 39967e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 39975f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 39985f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 39997e8381f9SStefano Zampini PetscFunctionReturn(0); 40007e8381f9SStefano Zampini } 4001ed502f03SStefano Zampini 4002219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4003219fbbafSJunchao Zhang { 4004219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 4005219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 4006cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4007219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4008219fbbafSJunchao Zhang 4009219fbbafSJunchao Zhang PetscFunctionBegin; 40105f80ce2aSJacob Faibussowitsch CHKERRQ(MatResetPreallocationCOO_SeqAIJ(mat)); 40115f80ce2aSJacob Faibussowitsch CHKERRQ(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4012219fbbafSJunchao Zhang if (coo_i) { 40135f80ce2aSJacob Faibussowitsch CHKERRQ(PetscGetMemType(coo_i,&mtype)); 4014219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4015219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { 4016cbc6b225SStefano Zampini if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4017219fbbafSJunchao Zhang } 4018219fbbafSJunchao Zhang } 4019219fbbafSJunchao Zhang } 4020219fbbafSJunchao Zhang 4021219fbbafSJunchao Zhang if (coo_basic) { /* i,j are 
on device or do not contain negative indices */ 40225f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4023219fbbafSJunchao Zhang } else { 40245f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4025cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 40265f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mat)); 4027219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ*>(mat->data); 4028219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 40295f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 40305f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 40315f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 40325f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4033219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4034219fbbafSJunchao Zhang } 4035219fbbafSJunchao Zhang PetscFunctionReturn(0); 4036219fbbafSJunchao Zhang } 4037219fbbafSJunchao Zhang 4038b6c38306SJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4039219fbbafSJunchao Zhang { 4040219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4041219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4042b6c38306SJunchao Zhang for (; i<nnz; i+= grid_size) { 4043b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4044b6c38306SJunchao Zhang for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4045b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES? 
0.0 : a[i]) + sum; 4046b6c38306SJunchao Zhang } 4047219fbbafSJunchao Zhang } 4048219fbbafSJunchao Zhang 4049219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4050219fbbafSJunchao Zhang { 4051219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4052219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4053219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4054219fbbafSJunchao Zhang PetscMemType memtype; 4055219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4056219fbbafSJunchao Zhang PetscScalar *Aa; 4057219fbbafSJunchao Zhang 4058219fbbafSJunchao Zhang PetscFunctionBegin; 4059219fbbafSJunchao Zhang if (dev->use_extended_coo) { 40605f80ce2aSJacob Faibussowitsch CHKERRQ(PetscGetMemType(v,&memtype)); 4061219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 40625f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 40635f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4064219fbbafSJunchao Zhang } 4065219fbbafSJunchao Zhang 40665f80ce2aSJacob Faibussowitsch if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 40675f80ce2aSJacob Faibussowitsch else CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4068219fbbafSJunchao Zhang 4069cbc6b225SStefano Zampini if (Annz) { 4070b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4071cbc6b225SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); 4072cbc6b225SStefano Zampini } 4073219fbbafSJunchao Zhang 40745f80ce2aSJacob Faibussowitsch if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 40755f80ce2aSJacob Faibussowitsch else CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4076219fbbafSJunchao Zhang 40775f80ce2aSJacob Faibussowitsch if 
(PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1)); 4078219fbbafSJunchao Zhang } else { 40795f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4080219fbbafSJunchao Zhang } 4081219fbbafSJunchao Zhang PetscFunctionReturn(0); 4082219fbbafSJunchao Zhang } 4083219fbbafSJunchao Zhang 40845b7e41feSStefano Zampini /*@C 40855b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 40865b7e41feSStefano Zampini 40875b7e41feSStefano Zampini Not collective 40885b7e41feSStefano Zampini 40895b7e41feSStefano Zampini Input Parameters: 40905b7e41feSStefano Zampini + A - the matrix 40915b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40925b7e41feSStefano Zampini 40935b7e41feSStefano Zampini Output Parameters: 40945b7e41feSStefano Zampini + ia - the CSR row pointers 40955b7e41feSStefano Zampini - ja - the CSR column indices 40965b7e41feSStefano Zampini 40975b7e41feSStefano Zampini Level: developer 40985b7e41feSStefano Zampini 40995b7e41feSStefano Zampini Notes: 41005b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 41015b7e41feSStefano Zampini 41025b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 41035b7e41feSStefano Zampini @*/ 41045f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 41055f101d05SStefano Zampini { 41065f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 41075f101d05SStefano Zampini CsrMatrix *csr; 41085f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41095f101d05SStefano Zampini 41105f101d05SStefano Zampini PetscFunctionBegin; 41115f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41125f101d05SStefano Zampini if (!i || !j) 
PetscFunctionReturn(0); 41135f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41142c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41155f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 4116*28b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 41175f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 41185f101d05SStefano Zampini if (i) { 41195f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 41205f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 41215f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 41225f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 41235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 41245f101d05SStefano Zampini } 41255f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 41265f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 41275f101d05SStefano Zampini } 41285f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 41295f101d05SStefano Zampini PetscFunctionReturn(0); 41305f101d05SStefano Zampini } 41315f101d05SStefano Zampini 41325b7e41feSStefano Zampini /*@C 41335b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 41345b7e41feSStefano Zampini 41355b7e41feSStefano Zampini Not collective 41365b7e41feSStefano Zampini 41375b7e41feSStefano Zampini Input Parameters: 41385b7e41feSStefano Zampini + A - the matrix 41395b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 41405b7e41feSStefano Zampini 41415b7e41feSStefano Zampini Output Parameters: 
+   i - the CSR row pointers; set to NULL on return when provided
-   j - the CSR column indices; set to NULL on return when provided

    Level: developer

    Notes:
      The compressed argument is not used by this routine

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only invalidate the caller's pointers; no device data is touched */
  if (i) { *i = NULL; }
  if (j) { *j = NULL; }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   The ELL and HYB storage formats are rejected with PETSC_ERR_SUP

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing it out */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)gpu->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: nothing to update on the matrix, just drop the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   The matrix is flagged as having its valid data on the GPU and its cached transpose is marked out of date

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* existing values may be read by the caller, so bring the device copy up to date first */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)gpu->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may modify the values: the GPU copy becomes the reference one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4257039c6fbaSStefano Zampini 42585b7e41feSStefano Zampini Not Collective 42595b7e41feSStefano Zampini 42605b7e41feSStefano Zampini Input Parameter: 42615b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42625b7e41feSStefano Zampini 42635b7e41feSStefano Zampini Output Parameter: 42645b7e41feSStefano Zampini . a - pointer to the device data 42655b7e41feSStefano Zampini 42665b7e41feSStefano Zampini Level: developer 42675b7e41feSStefano Zampini 42685b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray() 42695b7e41feSStefano Zampini @*/ 4270039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4271039c6fbaSStefano Zampini { 4272039c6fbaSStefano Zampini PetscFunctionBegin; 4273039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4274039c6fbaSStefano Zampini PetscValidPointer(a,2); 4275039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 42765f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJInvalidateDiagonal(A)); 42775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectStateIncrease((PetscObject)A)); 4278039c6fbaSStefano Zampini *a = NULL; 4279039c6fbaSStefano Zampini PetscFunctionReturn(0); 4280039c6fbaSStefano Zampini } 4281039c6fbaSStefano Zampini 42825b7e41feSStefano Zampini /*@C 42835b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42845b7e41feSStefano Zampini 42855b7e41feSStefano Zampini Not Collective 42865b7e41feSStefano Zampini 42875b7e41feSStefano Zampini Input Parameter: 42885b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42895b7e41feSStefano Zampini 42905b7e41feSStefano Zampini Output Parameter: 42915b7e41feSStefano Zampini . 
a - pointer to the device data 42925b7e41feSStefano Zampini 42935b7e41feSStefano Zampini Level: developer 42945b7e41feSStefano Zampini 42955b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 42965b7e41feSStefano Zampini 42975b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 42985b7e41feSStefano Zampini @*/ 4299ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4300ed502f03SStefano Zampini { 4301ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4302ed502f03SStefano Zampini CsrMatrix *csr; 4303ed502f03SStefano Zampini 4304ed502f03SStefano Zampini PetscFunctionBegin; 4305ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4306ed502f03SStefano Zampini PetscValidPointer(a,2); 4307ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 43082c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4309*28b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4310ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 4311*28b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4312ed502f03SStefano Zampini *a = csr->values->data().get(); 4313039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 43145f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4315ed502f03SStefano Zampini PetscFunctionReturn(0); 4316ed502f03SStefano Zampini } 4317ed502f03SStefano Zampini 43185b7e41feSStefano Zampini /*@C 43195b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 43205b7e41feSStefano Zampini 
43215b7e41feSStefano Zampini Not Collective 43225b7e41feSStefano Zampini 43235b7e41feSStefano Zampini Input Parameter: 43245b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43255b7e41feSStefano Zampini 43265b7e41feSStefano Zampini Output Parameter: 43275b7e41feSStefano Zampini . a - pointer to the device data 43285b7e41feSStefano Zampini 43295b7e41feSStefano Zampini Level: developer 43305b7e41feSStefano Zampini 43315b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 43325b7e41feSStefano Zampini @*/ 4333ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4334ed502f03SStefano Zampini { 4335ed502f03SStefano Zampini PetscFunctionBegin; 4336ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4337ed502f03SStefano Zampini PetscValidPointer(a,2); 4338ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 43395f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJInvalidateDiagonal(A)); 43405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectStateIncrease((PetscObject)A)); 4341ed502f03SStefano Zampini *a = NULL; 4342ed502f03SStefano Zampini PetscFunctionReturn(0); 4343ed502f03SStefano Zampini } 4344ed502f03SStefano Zampini 4345ed502f03SStefano Zampini struct IJCompare4 4346ed502f03SStefano Zampini { 4347ed502f03SStefano Zampini __host__ __device__ 43482ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4349ed502f03SStefano Zampini { 4350ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4351ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4352ed502f03SStefano Zampini return false; 4353ed502f03SStefano Zampini } 4354ed502f03SStefano Zampini }; 4355ed502f03SStefano Zampini 43568909a122SStefano Zampini struct Shift 43578909a122SStefano Zampini { 4358ed502f03SStefano Zampini int _shift; 4359ed502f03SStefano Zampini 
4360ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4361ed502f03SStefano Zampini __host__ __device__ 4362ed502f03SStefano Zampini inline int operator() (const int &c) 4363ed502f03SStefano Zampini { 4364ed502f03SStefano Zampini return c + _shift; 4365ed502f03SStefano Zampini } 4366ed502f03SStefano Zampini }; 4367ed502f03SStefano Zampini 4368ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4369ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4370ed502f03SStefano Zampini { 4371ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4372ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4373ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4374ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4375ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4376ed502f03SStefano Zampini cusparseStatus_t stat; 4377ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4378ed502f03SStefano Zampini 4379ed502f03SStefano Zampini PetscFunctionBegin; 4380ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4381ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4382ed502f03SStefano Zampini PetscValidPointer(C,4); 4383ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4384ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 43855f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 43862c71b3e2SJacob Faibussowitsch PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 43872c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == 
MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 43882c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4389ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4390ed502f03SStefano Zampini m = A->rmap->n; 4391ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 43925f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,C)); 43935f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*C,m,n,m,n)); 43945f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*C,MATSEQAIJCUSPARSE)); 4395ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4396ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4397ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4398ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4399ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4400ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4401ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4402ed502f03SStefano Zampini c->compressedrow.i = NULL; 4403ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4404ed502f03SStefano Zampini Ccusp->workVector = NULL; 4405ed502f03SStefano Zampini Ccusp->nrows = m; 4406ed502f03SStefano Zampini Ccusp->mat = Cmat; 4407ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4408ed502f03SStefano Zampini Ccsr->num_rows = m; 4409ed502f03SStefano Zampini Ccsr->num_cols = n; 44105f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 44115f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 44125f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 44135f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 44145f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 
44155f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 44165f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44175f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44185f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44195f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 44205f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B)); 4421*28b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4422*28b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4423ed502f03SStefano Zampini 4424ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4425ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4426ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4427ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4428ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4429ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4430ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4431ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4432ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4433ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4434ed502f03SStefano Zampini if (c->nz) { 44352ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 44362ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 44372ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 44382ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 44392ed87e7eSStefano Zampini 
4440ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4441ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4442ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4443ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 44445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4445ed502f03SStefano Zampini } 44462ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 44472ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4448ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4449ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4450ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4451ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 44525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4453ed502f03SStefano Zampini } 44542ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 44552ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 44565f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 44572ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 44582ed87e7eSStefano Zampini Aroff->data().get(), 44592ed87e7eSStefano Zampini Annz, 44602ed87e7eSStefano Zampini m, 44612ed87e7eSStefano Zampini Acoo->data().get(), 44622ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4463ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 44642ed87e7eSStefano Zampini Broff->data().get(), 4465ed502f03SStefano Zampini Bnnz, 4466ed502f03SStefano Zampini m, 44672ed87e7eSStefano Zampini Bcoo->data().get(), 4468ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 44692ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 44702ed87e7eSStefano Zampini auto Aperm = 
thrust::make_constant_iterator(1); 44712ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 44728909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4473ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4474ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 44758909a122SStefano Zampini #else 44768909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 44778909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 44788909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 44798909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 44808909a122SStefano Zampini #endif 44812ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 44822ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 44832ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 44842ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 44852ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 44862ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4487ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4488ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4489ed502f03SStefano Zampini thrust::advance(p2,Annz); 44902ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 44918909a122SStefano 
Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 44928909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 44938909a122SStefano Zampini #endif 44942ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 44952ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 44962ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 44972ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 44982ed87e7eSStefano Zampini #else 44992ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 45002ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 45012ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 45022ed87e7eSStefano Zampini #endif 4503ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 45042ed87e7eSStefano Zampini Ccoo->data().get(), 4505ed502f03SStefano Zampini c->nz, 4506ed502f03SStefano Zampini m, 4507ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4508ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 45095f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 45102ed87e7eSStefano Zampini delete wPerm; 45112ed87e7eSStefano Zampini delete Acoo; 45122ed87e7eSStefano Zampini delete Bcoo; 45132ed87e7eSStefano Zampini delete Ccoo; 4514ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4515ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4516ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4517ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4518ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4519ed502f03SStefano Zampini #endif 
45201a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 45215f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 45225f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4523ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4524ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4525ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4526ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4527ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4528ed502f03SStefano Zampini 45291a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 45301a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4531a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4532ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4533ed502f03SStefano Zampini CmatT->mat = CcsrT; 4534ed502f03SStefano Zampini CcsrT->num_rows = n; 4535ed502f03SStefano Zampini CcsrT->num_cols = m; 4536ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4537ed502f03SStefano Zampini 4538ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4539ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4540ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4541ed502f03SStefano Zampini 45425f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 4543ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4544ed502f03SStefano Zampini if (AT) { 4545ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4546ed502f03SStefano Zampini thrust::advance(rT,-1); 4547ed502f03SStefano Zampini } 4548ed502f03SStefano Zampini if (BT) 
{ 4549ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4550ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4551ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4552ed502f03SStefano Zampini } 4553ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4554ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4555ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4556ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4557ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4558ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 45595f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 4560ed502f03SStefano Zampini 45615f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 45625f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 45635f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 45645f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 45655f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 45665f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 45675f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45685f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45695f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(CmatT->beta_one, 
&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4570ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4571ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4572ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4573ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4574ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4575ed502f03SStefano Zampini #endif 4576ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4577ed502f03SStefano Zampini } 4578ed502f03SStefano Zampini } 4579ed502f03SStefano Zampini 4580ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4581ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4582ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 45835f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m+1,&c->i)); 45845f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(c->nz,&c->j)); 4585ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4586ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4587ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4588ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4589ed502f03SStefano Zampini jj = *Ccsr->column_indices; 45905f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45915f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4592ed502f03SStefano Zampini } else { 45935f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45945f80ce2aSJacob Faibussowitsch 
CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4595ed502f03SStefano Zampini } 45965f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 45975f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&c->ilen)); 45985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&c->imax)); 4599ed502f03SStefano Zampini c->maxnz = c->nz; 4600ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4601ed502f03SStefano Zampini c->rmax = 0; 4602ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4603ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4604ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4605ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4606ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4607ed502f03SStefano Zampini } 46085f80ce2aSJacob Faibussowitsch CHKERRQ(MatMarkDiagonal_SeqAIJ(*C)); 46095f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(c->nz,&c->a)); 4610ed502f03SStefano Zampini (*C)->nonzerostate++; 46115f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp((*C)->rmap)); 46125f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp((*C)->cmap)); 4613ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4614ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4615ed502f03SStefano Zampini } else { 46162c71b3e2SJacob Faibussowitsch PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4617ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4618ed502f03SStefano Zampini if (c->nz) { 4619ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 46205f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 46212c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || 
Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 46222c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 46235f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 46245f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B)); 46255f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 46265f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4627ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4628ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4629ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 46302c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 46312c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 46322c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 46332c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 46345f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" 
PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4635ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4636ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 46375f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 4638ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4639ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4640ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4641ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4642ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4643ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4644ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4645ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4646ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4647ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 46485f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 46491a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 46505f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4651ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4652ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4653ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4654ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4655ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4656ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4657ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 46581a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4659ed502f03SStefano Zampini } 46605f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 4661ed502f03SStefano Zampini } 4662ed502f03SStefano Zampini } 46635f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectStateIncrease((PetscObject)*C)); 4664ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4665ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4666ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4667ed502f03SStefano Zampini PetscFunctionReturn(0); 4668ed502f03SStefano Zampini } 4669c215019aSStefano Zampini 4670c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4671c215019aSStefano Zampini { 4672c215019aSStefano Zampini bool dmem; 4673c215019aSStefano Zampini const PetscScalar *av; 4674c215019aSStefano Zampini 4675c215019aSStefano Zampini PetscFunctionBegin; 4676c215019aSStefano Zampini dmem = isCudaMem(v); 46775f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4678c215019aSStefano Zampini if (n && idx) { 4679c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4680c215019aSStefano Zampini widx.assign(idx,idx+n); 46815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4682c215019aSStefano Zampini 4683c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4684c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4685c215019aSStefano Zampini if (dmem) { 4686c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4687c215019aSStefano Zampini } 
else { 4688c215019aSStefano Zampini w = new THRUSTARRAY(n); 4689c215019aSStefano Zampini dv = w->data(); 4690c215019aSStefano Zampini } 4691c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4692c215019aSStefano Zampini 4693c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4694c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4695c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4696c215019aSStefano Zampini if (w) { 46975f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4698c215019aSStefano Zampini } 4699c215019aSStefano Zampini delete w; 4700c215019aSStefano Zampini } else { 47015f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4702c215019aSStefano Zampini } 47035f80ce2aSJacob Faibussowitsch if (!dmem) CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 47045f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4705c215019aSStefano Zampini PetscFunctionReturn(0); 4706c215019aSStefano Zampini } 4707