19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94c215019aSStefano Zampini 95ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 969ae82921SPaul Mullowney { 979ae82921SPaul Mullowney PetscFunctionBegin; 989ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 999ae82921SPaul Mullowney PetscFunctionReturn(0); 1009ae82921SPaul Mullowney } 1019ae82921SPaul Mullowney 102c708e6cdSJed Brown /*MC 103087f3262SPaul 
Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 104087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 105087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 106087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 107087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 108087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 109c708e6cdSJed Brown 1109ae82921SPaul Mullowney Level: beginner 111c708e6cdSJed Brown 1123ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 113c708e6cdSJed Brown M*/ 1149ae82921SPaul Mullowney 11542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1169ae82921SPaul Mullowney { 117bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1189ae82921SPaul Mullowney 1199ae82921SPaul Mullowney PetscFunctionBegin; 1209566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1219566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B,n,n,n,n)); 1222c7c0729SBarry Smith (*B)->factortype = ftype; 1239566063dSJacob Faibussowitsch PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 1242205254eSKarl Rupp 1259566063dSJacob Faibussowitsch if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 126087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 1279566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1289c1083e7SRichard Tran Mills 
if (!A->boundtocpu) { 1299ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1309ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1319c1083e7SRichard Tran Mills } else { 1329c1083e7SRichard Tran Mills (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1339c1083e7SRichard Tran Mills (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1349c1083e7SRichard Tran Mills } 1359566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1369566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 1379566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 138087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1399c1083e7SRichard Tran Mills if (!A->boundtocpu) { 140087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 141087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1429c1083e7SRichard Tran Mills } else { 1439c1083e7SRichard Tran Mills (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1449c1083e7SRichard Tran Mills (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1459c1083e7SRichard Tran Mills } 1469566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 1479566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 1489ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 149bc3f50f2SPaul Mullowney 1509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1514ac6704cSBarry 
Smith (*B)->canuseordering = PETSC_TRUE; 1529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 1539ae82921SPaul Mullowney PetscFunctionReturn(0); 1549ae82921SPaul Mullowney } 1559ae82921SPaul Mullowney 156bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 157ca45077fSPaul Mullowney { 158aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1596e111a19SKarl Rupp 160ca45077fSPaul Mullowney PetscFunctionBegin; 161ca45077fSPaul Mullowney switch (op) { 162e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 163aa372e3fSPaul Mullowney cusparsestruct->format = format; 164ca45077fSPaul Mullowney break; 165e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 166aa372e3fSPaul Mullowney cusparsestruct->format = format; 167ca45077fSPaul Mullowney break; 168ca45077fSPaul Mullowney default: 16998921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 170ca45077fSPaul Mullowney } 171ca45077fSPaul Mullowney PetscFunctionReturn(0); 172ca45077fSPaul Mullowney } 1739ae82921SPaul Mullowney 174e057df02SPaul Mullowney /*@ 175e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 176e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 177aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 178e057df02SPaul Mullowney Not Collective 179e057df02SPaul Mullowney 180e057df02SPaul Mullowney Input Parameters: 1818468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 18236d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. 
MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1832692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 184e057df02SPaul Mullowney 185e057df02SPaul Mullowney Output Parameter: 186e057df02SPaul Mullowney 187e057df02SPaul Mullowney Level: intermediate 188e057df02SPaul Mullowney 1898468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 190e057df02SPaul Mullowney @*/ 191e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 192e057df02SPaul Mullowney { 193e057df02SPaul Mullowney PetscFunctionBegin; 194e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 195cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 196e057df02SPaul Mullowney PetscFunctionReturn(0); 197e057df02SPaul Mullowney } 198e057df02SPaul Mullowney 199365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 200365b711fSMark Adams { 201365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 202365b711fSMark Adams 203365b711fSMark Adams PetscFunctionBegin; 204365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 205365b711fSMark Adams PetscFunctionReturn(0); 206365b711fSMark Adams } 207365b711fSMark Adams 208365b711fSMark Adams /*@ 209365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 
210365b711fSMark Adams 211365b711fSMark Adams Input Parameters: 212365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 213365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 214365b711fSMark Adams 215365b711fSMark Adams Output Parameter: 216365b711fSMark Adams 217365b711fSMark Adams Notes: 218365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 219365b711fSMark Adams and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 220365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 221365b711fSMark Adams 222365b711fSMark Adams Level: intermediate 223365b711fSMark Adams 224365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 225365b711fSMark Adams @*/ 226365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 227365b711fSMark Adams { 228365b711fSMark Adams PetscFunctionBegin; 229365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 230cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 231365b711fSMark Adams PetscFunctionReturn(0); 232365b711fSMark Adams } 233365b711fSMark Adams 2341a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 235e6e9a74fSStefano Zampini { 236e6e9a74fSStefano Zampini PetscFunctionBegin; 2371a2c6b5cSJunchao Zhang switch (op) { 2381a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2391a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2409566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 2411a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2421a2c6b5cSJunchao Zhang break; 2431a2c6b5cSJunchao Zhang 
default: 2449566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 2451a2c6b5cSJunchao Zhang break; 246e6e9a74fSStefano Zampini } 247e6e9a74fSStefano Zampini PetscFunctionReturn(0); 248e6e9a74fSStefano Zampini } 249e6e9a74fSStefano Zampini 250bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 251bddcd29dSMark Adams 252bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 253bddcd29dSMark Adams { 254bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 255bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 256bddcd29dSMark Adams PetscBool row_identity,col_identity; 257365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 258bddcd29dSMark Adams 259bddcd29dSMark Adams PetscFunctionBegin; 2609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2619566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 262bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 263bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. 
*/ 2649566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 2659566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 266bddcd29dSMark Adams if (row_identity && col_identity) { 267365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 268bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 269bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 270365b711fSMark Adams } 271bddcd29dSMark Adams B->ops->matsolve = NULL; 272bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 273bddcd29dSMark Adams } else { 274365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 275bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 276bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 277365b711fSMark Adams } 278bddcd29dSMark Adams B->ops->matsolve = NULL; 279bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 280bddcd29dSMark Adams } 281bddcd29dSMark Adams 282bddcd29dSMark Adams /* get the triangular factors */ 283365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 2849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 285365b711fSMark Adams } 286bddcd29dSMark Adams PetscFunctionReturn(0); 287bddcd29dSMark Adams } 288bddcd29dSMark Adams 2894416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2909ae82921SPaul Mullowney { 2919ae82921SPaul Mullowney PetscErrorCode ierr; 292e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2939ae82921SPaul Mullowney PetscBool flg; 294a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2956e111a19SKarl Rupp 2969ae82921SPaul Mullowney PetscFunctionBegin; 2979566063dSJacob Faibussowitsch PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options")); 2989ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 299e057df02SPaul Mullowney ierr 
= PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 3009566063dSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr); 3019566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 302afb2bd1cSJunchao Zhang 3034c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 3049566063dSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr); 3059566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 3069566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 3079566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 308afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 309afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 3109566063dSJacob Faibussowitsch "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);PetscCall(ierr); 311afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 3128efa179dSJose E. 
Roman #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 3132c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 314a435da06SStefano Zampini #else 3152c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 316a435da06SStefano Zampini #endif 317afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 3189566063dSJacob Faibussowitsch "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);PetscCall(ierr); 3192c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 320afb2bd1cSJunchao Zhang 321afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 3229566063dSJacob Faibussowitsch "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);PetscCall(ierr); 3232c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 324afb2bd1cSJunchao Zhang #endif 3254c87dfd4SPaul Mullowney } 3269566063dSJacob Faibussowitsch PetscCall(PetscOptionsTail()); 3279ae82921SPaul Mullowney PetscFunctionReturn(0); 3289ae82921SPaul Mullowney } 3299ae82921SPaul Mullowney 3306fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const 
MatFactorInfo *info) 3319ae82921SPaul Mullowney { 332da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3339ae82921SPaul Mullowney 3349ae82921SPaul Mullowney PetscFunctionBegin; 3359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3369566063dSJacob Faibussowitsch PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 3379ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3389ae82921SPaul Mullowney PetscFunctionReturn(0); 3399ae82921SPaul Mullowney } 3409ae82921SPaul Mullowney 3416fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3429ae82921SPaul Mullowney { 343da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3449ae82921SPaul Mullowney 3459ae82921SPaul Mullowney PetscFunctionBegin; 3469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3479566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 3489ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3499ae82921SPaul Mullowney PetscFunctionReturn(0); 3509ae82921SPaul Mullowney } 3519ae82921SPaul Mullowney 352087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 353087f3262SPaul Mullowney { 354da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 355087f3262SPaul Mullowney 356087f3262SPaul Mullowney PetscFunctionBegin; 3579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3589566063dSJacob Faibussowitsch PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 359087f3262SPaul Mullowney B->ops->choleskyfactornumeric = 
MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 360087f3262SPaul Mullowney PetscFunctionReturn(0); 361087f3262SPaul Mullowney } 362087f3262SPaul Mullowney 363087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 364087f3262SPaul Mullowney { 365da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 366087f3262SPaul Mullowney 367087f3262SPaul Mullowney PetscFunctionBegin; 3689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3699566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 370087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 371087f3262SPaul Mullowney PetscFunctionReturn(0); 372087f3262SPaul Mullowney } 373087f3262SPaul Mullowney 374087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3759ae82921SPaul Mullowney { 3769ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3779ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3789ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 379aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3809ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3819ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3829ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3839ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 3849ae82921SPaul Mullowney 3859ae82921SPaul Mullowney PetscFunctionBegin; 386cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 387c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3889ae82921SPaul Mullowney try { 3899ae82921SPaul Mullowney /* first figure out the number 
of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3909ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 391da79fbbcSStefano Zampini if (!loTriFactor) { 3922cbc15d9SMark PetscScalar *AALo; 3932cbc15d9SMark 3949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 3959ae82921SPaul Mullowney 3969ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 3979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 3989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 3999ae82921SPaul Mullowney 4009ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 4019ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 4029ae82921SPaul Mullowney AiLo[n] = nzLower; 4039ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 4049ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4059ae82921SPaul Mullowney v = aa; 4069ae82921SPaul Mullowney vi = aj; 4079ae82921SPaul Mullowney offset = 1; 4089ae82921SPaul Mullowney rowOffset= 1; 4099ae82921SPaul Mullowney for (i=1; i<n; i++) { 4109ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 411e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4129ae82921SPaul Mullowney AiLo[i] = rowOffset; 4139ae82921SPaul Mullowney rowOffset += nz+1; 4149ae82921SPaul Mullowney 4159566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 4169566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 4179ae82921SPaul Mullowney 4189ae82921SPaul Mullowney offset += nz; 4199ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4209ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4219ae82921SPaul Mullowney offset += 1; 4229ae82921SPaul Mullowney 4239ae82921SPaul Mullowney v += nz; 4249ae82921SPaul Mullowney vi += nz; 4259ae82921SPaul Mullowney } 4262205254eSKarl Rupp 427aa372e3fSPaul Mullowney /* allocate space for the triangular 
factor information */ 4289566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 429da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 430aa372e3fSPaul Mullowney /* Create the matrix description */ 4319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 4329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 435afb2bd1cSJunchao Zhang #else 4369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 437afb2bd1cSJunchao Zhang #endif 4389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 4399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 440aa372e3fSPaul Mullowney 441aa372e3fSPaul Mullowney /* set the operation */ 442aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 443aa372e3fSPaul Mullowney 444aa372e3fSPaul Mullowney /* set the matrix */ 445aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 446aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 447aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 448aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 449aa372e3fSPaul Mullowney 450aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 451aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 452aa372e3fSPaul Mullowney 453aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 454aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 
455aa372e3fSPaul Mullowney 456aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 457aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 458aa372e3fSPaul Mullowney 459afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4609566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 4619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 4621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 464afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 465afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 466afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 4675f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 4689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 469afb2bd1cSJunchao Zhang #endif 470afb2bd1cSJunchao Zhang 471aa372e3fSPaul Mullowney /* perform the solve analysis */ 4729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 473aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 474aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 475d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 4761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 477d49cd2b7SBarry Smith loTriFactor->solveInfo, 4785f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 479d49cd2b7SBarry Smith #else 4805f80ce2aSJacob 
Faibussowitsch loTriFactor->solveInfo)); 481afb2bd1cSJunchao Zhang #endif 4829566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4839566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 484aa372e3fSPaul Mullowney 485da79fbbcSStefano Zampini /* assign the pointer */ 486aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4872cbc15d9SMark loTriFactor->AA_h = AALo; 4889566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 4899566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 4909566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 491da79fbbcSStefano Zampini } else { /* update values only */ 4922cbc15d9SMark if (!loTriFactor->AA_h) { 4939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 4942cbc15d9SMark } 495da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4962cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 497da79fbbcSStefano Zampini v = aa; 498da79fbbcSStefano Zampini vi = aj; 499da79fbbcSStefano Zampini offset = 1; 500da79fbbcSStefano Zampini for (i=1; i<n; i++) { 501da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 5029566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 503da79fbbcSStefano Zampini offset += nz; 5042cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 505da79fbbcSStefano Zampini offset += 1; 506da79fbbcSStefano Zampini v += nz; 507da79fbbcSStefano Zampini } 5082cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 5099566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 510da79fbbcSStefano Zampini } 5119ae82921SPaul Mullowney } catch(char *ex) { 51298921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5139ae82921SPaul Mullowney } 5149ae82921SPaul Mullowney 
} 5159ae82921SPaul Mullowney PetscFunctionReturn(0); 5169ae82921SPaul Mullowney } 5179ae82921SPaul Mullowney 518087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5199ae82921SPaul Mullowney { 5209ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5219ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5229ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 523aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5249ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5259ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5269ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5279ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5289ae82921SPaul Mullowney 5299ae82921SPaul Mullowney PetscFunctionBegin; 530cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 531c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5329ae82921SPaul Mullowney try { 5339ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5349ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 535da79fbbcSStefano Zampini if (!upTriFactor) { 5362cbc15d9SMark PetscScalar *AAUp; 5372cbc15d9SMark 5389566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 5392cbc15d9SMark 5409ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 5419566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 5429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 5439ae82921SPaul Mullowney 5449ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5459ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5469ae82921SPaul Mullowney AiUp[n]=nzUpper; 5479ae82921SPaul Mullowney offset = nzUpper; 5489ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5499ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5509ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5519ae82921SPaul Mullowney 552e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5539ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5549ae82921SPaul Mullowney 555e057df02SPaul Mullowney /* decrement the offset */ 5569ae82921SPaul Mullowney offset -= (nz+1); 5579ae82921SPaul Mullowney 558e057df02SPaul Mullowney /* first, set the diagonal elements */ 5599ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 56009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5619ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5629ae82921SPaul Mullowney 5639566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 5649566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 5659ae82921SPaul Mullowney } 5662205254eSKarl Rupp 567aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 5689566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 569da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5702205254eSKarl Rupp 571aa372e3fSPaul Mullowney /* Create the matrix description */ 5729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 5739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 5741b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 5759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 576afb2bd1cSJunchao Zhang #else 5779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 578afb2bd1cSJunchao Zhang #endif 5799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 5809566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 581aa372e3fSPaul Mullowney 582aa372e3fSPaul Mullowney /* set the operation */ 583aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 584aa372e3fSPaul Mullowney 585aa372e3fSPaul Mullowney /* set the matrix */ 586aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 587aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 588aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 589aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 590aa372e3fSPaul Mullowney 591aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 592aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 593aa372e3fSPaul Mullowney 594aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 595aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 596aa372e3fSPaul Mullowney 597aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
598aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 599aa372e3fSPaul Mullowney 600afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6019566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 6029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 6031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 6049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 605afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 606afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 607afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 6085f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 6099566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 610afb2bd1cSJunchao Zhang #endif 611afb2bd1cSJunchao Zhang 612aa372e3fSPaul Mullowney /* perform the solve analysis */ 6139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 614aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 615aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 616d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 618d49cd2b7SBarry Smith upTriFactor->solveInfo, 6195f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 620d49cd2b7SBarry Smith #else 6215f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 622afb2bd1cSJunchao Zhang #endif 6239566063dSJacob Faibussowitsch 
PetscCallCUDA(WaitForCUDA()); 6249566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 625aa372e3fSPaul Mullowney 626da79fbbcSStefano Zampini /* assign the pointer */ 627aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6282cbc15d9SMark upTriFactor->AA_h = AAUp; 6299566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 6309566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 6319566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 632da79fbbcSStefano Zampini } else { 6332cbc15d9SMark if (!upTriFactor->AA_h) { 6349566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 6352cbc15d9SMark } 636da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 637da79fbbcSStefano Zampini offset = nzUpper; 638da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 639da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 640da79fbbcSStefano Zampini 641da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 642da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 643da79fbbcSStefano Zampini 644da79fbbcSStefano Zampini /* decrement the offset */ 645da79fbbcSStefano Zampini offset -= (nz+1); 646da79fbbcSStefano Zampini 647da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6482cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6499566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 650da79fbbcSStefano Zampini } 6512cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 6529566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 653da79fbbcSStefano Zampini } 6549ae82921SPaul Mullowney } catch(char *ex) { 65598921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6569ae82921SPaul 
Mullowney } 6579ae82921SPaul Mullowney } 6589ae82921SPaul Mullowney PetscFunctionReturn(0); 6599ae82921SPaul Mullowney } 6609ae82921SPaul Mullowney 661087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6629ae82921SPaul Mullowney { 6639ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6649ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6659ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6669ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6679ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6689ae82921SPaul Mullowney 6699ae82921SPaul Mullowney PetscFunctionBegin; 67028b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 6719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 6729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 6732205254eSKarl Rupp 674da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 675aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6769ae82921SPaul Mullowney 677c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 678e057df02SPaul Mullowney /* lower triangular indices */ 6799566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 680da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 681da79fbbcSStefano Zampini const PetscInt *r; 682da79fbbcSStefano Zampini 6839566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r)); 684aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 685aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6869566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r)); 6879566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 
688da79fbbcSStefano Zampini } 6899ae82921SPaul Mullowney 690e057df02SPaul Mullowney /* upper triangular indices */ 6919566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 692da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 693da79fbbcSStefano Zampini const PetscInt *c; 694da79fbbcSStefano Zampini 6959566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&c)); 696aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 697aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&c)); 6999566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 700da79fbbcSStefano Zampini } 7019ae82921SPaul Mullowney PetscFunctionReturn(0); 7029ae82921SPaul Mullowney } 7039ae82921SPaul Mullowney 704087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 705087f3262SPaul Mullowney { 706087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 707087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 708aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 709aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 710087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 711087f3262SPaul Mullowney PetscScalar *AAUp; 712087f3262SPaul Mullowney PetscScalar *AALo; 713087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 714087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 715087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 716087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 717087f3262SPaul Mullowney 718087f3262SPaul Mullowney PetscFunctionBegin; 719cf00fe3bSKarl Rupp if (!n) 
PetscFunctionReturn(0); 720c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 721087f3262SPaul Mullowney try { 7229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 7239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 724da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 725087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 7269566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 7279566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 728087f3262SPaul Mullowney 729087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 730087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 731087f3262SPaul Mullowney AiUp[n]=nzUpper; 732087f3262SPaul Mullowney offset = 0; 733087f3262SPaul Mullowney for (i=0; i<n; i++) { 734087f3262SPaul Mullowney /* set the pointers */ 735087f3262SPaul Mullowney v = aa + ai[i]; 736087f3262SPaul Mullowney vj = aj + ai[i]; 737087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 738087f3262SPaul Mullowney 739087f3262SPaul Mullowney /* first, set the diagonal elements */ 740087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 74109f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 742087f3262SPaul Mullowney AiUp[i] = offset; 74309f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 744087f3262SPaul Mullowney 745087f3262SPaul Mullowney offset+=1; 746087f3262SPaul Mullowney if (nz>0) { 7479566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 7489566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 749087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 750087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 751087f3262SPaul Mullowney AALo[j] = 
AAUp[j]/v[nz]; 752087f3262SPaul Mullowney } 753087f3262SPaul Mullowney offset+=nz; 754087f3262SPaul Mullowney } 755087f3262SPaul Mullowney } 756087f3262SPaul Mullowney 757aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 7589566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 759da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 760087f3262SPaul Mullowney 761aa372e3fSPaul Mullowney /* Create the matrix description */ 7629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 7639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 7641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 766afb2bd1cSJunchao Zhang #else 7679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 768afb2bd1cSJunchao Zhang #endif 7699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 7709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 771087f3262SPaul Mullowney 772aa372e3fSPaul Mullowney /* set the matrix */ 773aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 774aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 775aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 776aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 777aa372e3fSPaul Mullowney 778aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 779aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 780aa372e3fSPaul Mullowney 781aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices 
= new THRUSTINTARRAY32(a->nz); 782aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 783aa372e3fSPaul Mullowney 784aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 785aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 786aa372e3fSPaul Mullowney 787afb2bd1cSJunchao Zhang /* set the operation */ 788afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 789afb2bd1cSJunchao Zhang 790afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 7919566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 7929566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 7931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 795afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 796afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 797afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 7985f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 7999566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 800afb2bd1cSJunchao Zhang #endif 801afb2bd1cSJunchao Zhang 802aa372e3fSPaul Mullowney /* perform the solve analysis */ 8039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 804aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 805aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 806d49cd2b7SBarry Smith 
upTriFactor->csrMat->column_indices->data().get(), 8071b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 808d49cd2b7SBarry Smith upTriFactor->solveInfo, 8095f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 810d49cd2b7SBarry Smith #else 8115f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 812afb2bd1cSJunchao Zhang #endif 8139566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8149566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 815aa372e3fSPaul Mullowney 816da79fbbcSStefano Zampini /* assign the pointer */ 817aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 818aa372e3fSPaul Mullowney 819aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8209566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 821da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 822aa372e3fSPaul Mullowney 823aa372e3fSPaul Mullowney /* Create the matrix description */ 8249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 8259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8261b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 828afb2bd1cSJunchao Zhang #else 8299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 830afb2bd1cSJunchao Zhang #endif 8319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 833aa372e3fSPaul Mullowney 834aa372e3fSPaul Mullowney /* set the operation */ 
835aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 836aa372e3fSPaul Mullowney 837aa372e3fSPaul Mullowney /* set the matrix */ 838aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 839aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 840aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 841aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 842aa372e3fSPaul Mullowney 843aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 844aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 845aa372e3fSPaul Mullowney 846aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 847aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 848aa372e3fSPaul Mullowney 849aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 850aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 851aa372e3fSPaul Mullowney 852afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8539566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 8549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 8551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 857afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 858afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 859afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 8605f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 8619566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 862afb2bd1cSJunchao Zhang #endif 863afb2bd1cSJunchao Zhang 864aa372e3fSPaul Mullowney /* perform the solve analysis */ 8659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 866aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 867aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 868d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 8691b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 870d49cd2b7SBarry Smith loTriFactor->solveInfo, 8715f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 872d49cd2b7SBarry Smith #else 8735f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 874afb2bd1cSJunchao Zhang #endif 8759566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8769566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 877aa372e3fSPaul Mullowney 878da79fbbcSStefano Zampini /* assign the pointer */ 879aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 880087f3262SPaul Mullowney 8819566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 8829566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 8839566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 884da79fbbcSStefano Zampini } else { 885da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 886da79fbbcSStefano Zampini offset = 0; 887da79fbbcSStefano Zampini for (i=0; i<n; i++) { 888da79fbbcSStefano Zampini /* set the pointers */ 889da79fbbcSStefano Zampini v = aa + ai[i]; 890da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 891da79fbbcSStefano Zampini 892da79fbbcSStefano 
Zampini /* first, set the diagonal elements */ 893da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 894da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 895da79fbbcSStefano Zampini 896da79fbbcSStefano Zampini offset+=1; 897da79fbbcSStefano Zampini if (nz>0) { 8989566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 899da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 900da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 901da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 902da79fbbcSStefano Zampini } 903da79fbbcSStefano Zampini offset+=nz; 904da79fbbcSStefano Zampini } 905da79fbbcSStefano Zampini } 90628b400f6SJacob Faibussowitsch PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 90728b400f6SJacob Faibussowitsch PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 909da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 9109566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 911da79fbbcSStefano Zampini } 9129566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 9139566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 914087f3262SPaul Mullowney } catch(char *ex) { 91598921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 916087f3262SPaul Mullowney } 917087f3262SPaul Mullowney } 918087f3262SPaul Mullowney PetscFunctionReturn(0); 919087f3262SPaul Mullowney } 920087f3262SPaul Mullowney 921087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9229ae82921SPaul Mullowney { 923087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 924087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 925087f3262SPaul Mullowney IS ip = a->row; 926087f3262SPaul Mullowney PetscBool 
perm_identity; 927087f3262SPaul Mullowney PetscInt n = A->rmap->n; 928087f3262SPaul Mullowney 929087f3262SPaul Mullowney PetscFunctionBegin; 93028b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 9319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 932da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 933aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 934aa372e3fSPaul Mullowney 935da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 936da79fbbcSStefano Zampini 937087f3262SPaul Mullowney /* lower triangular indices */ 9389566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 939087f3262SPaul Mullowney if (!perm_identity) { 9404e4bbfaaSStefano Zampini IS iip; 941da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9424e4bbfaaSStefano Zampini 9439566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 9449566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip,&irip)); 9459566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip,&rip)); 946aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 947aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 948aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9494e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9509566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip,&irip)); 9519566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 9529566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip,&rip)); 9539566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 954da79fbbcSStefano Zampini } 955087f3262SPaul Mullowney PetscFunctionReturn(0); 956087f3262SPaul Mullowney } 957087f3262SPaul Mullowney 958087f3262SPaul Mullowney static PetscErrorCode 
MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 959087f3262SPaul Mullowney { 960087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 961087f3262SPaul Mullowney IS ip = b->row; 962087f3262SPaul Mullowney PetscBool perm_identity; 963087f3262SPaul Mullowney 964087f3262SPaul Mullowney PetscFunctionBegin; 9659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 9669566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 967ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 968087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 9699566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 970087f3262SPaul Mullowney if (perm_identity) { 971087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 972087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9734e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9744e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 975087f3262SPaul Mullowney } else { 976087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 977087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9784e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9794e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 980087f3262SPaul Mullowney } 981087f3262SPaul Mullowney 982087f3262SPaul Mullowney /* get the triangular factors */ 9839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 984087f3262SPaul Mullowney PetscFunctionReturn(0); 985087f3262SPaul Mullowney } 9869ae82921SPaul Mullowney 987b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 988bda325fcSPaul Mullowney { 989bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 990aa372e3fSPaul Mullowney 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 991aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 992da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 993da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 994aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 995aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 996aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 997aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 998b175d8bbSPaul Mullowney 999bda325fcSPaul Mullowney PetscFunctionBegin; 1000aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 10019566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 1002da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1003aa372e3fSPaul Mullowney 1004aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1005aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1006aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1007aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1008aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1009aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1010aa372e3fSPaul Mullowney 1011aa372e3fSPaul Mullowney /* Create the matrix description */ 10129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 10139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 10149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 10159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 10169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1017aa372e3fSPaul Mullowney 1018aa372e3fSPaul Mullowney /* set the operation */ 1019aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1020aa372e3fSPaul Mullowney 1021aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1022aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1023afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1024afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1025aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1026afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1027afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1028afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1029aa372e3fSPaul Mullowney 1030aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1031afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 10329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1033afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1034afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1035afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1036afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1037afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1038afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1039afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 10405f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 10419566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1042afb2bd1cSJunchao Zhang #endif 1043afb2bd1cSJunchao Zhang 10449566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 10459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1046aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1047aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1048aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1049aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1050aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1051afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1052afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1053afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 
10545f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1055afb2bd1cSJunchao Zhang #else 1056afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 10575f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1058afb2bd1cSJunchao Zhang #endif 10599566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10609566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1061aa372e3fSPaul Mullowney 1062afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 10639566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 10649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 10651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 10669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1067afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1068afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1069afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 10705f80ce2aSJacob Faibussowitsch &loTriFactorT->solveBufferSize)); 10719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1072afb2bd1cSJunchao Zhang #endif 1073afb2bd1cSJunchao Zhang 1074afb2bd1cSJunchao Zhang /* perform the solve analysis */ 10759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 
loTriFactorT->csrMat->row_offsets->data().get(), 1078d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 10791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080d49cd2b7SBarry Smith loTriFactorT->solveInfo, 10815f80ce2aSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1082d49cd2b7SBarry Smith #else 10835f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 1084afb2bd1cSJunchao Zhang #endif 10859566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10869566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1087aa372e3fSPaul Mullowney 1088da79fbbcSStefano Zampini /* assign the pointer */ 1089aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1090aa372e3fSPaul Mullowney 1091aa372e3fSPaul Mullowney /*********************************************/ 1092aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1093aa372e3fSPaul Mullowney /*********************************************/ 1094aa372e3fSPaul Mullowney 1095aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 10969566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 1097da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1098aa372e3fSPaul Mullowney 1099aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1100aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1101aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1102aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1103aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1104aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1105aa372e3fSPaul Mullowney 1106aa372e3fSPaul Mullowney /* Create the matrix description */ 11079566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 11089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 11099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 11109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 11119566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1112aa372e3fSPaul Mullowney 1113aa372e3fSPaul Mullowney /* set the operation */ 1114aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1115aa372e3fSPaul Mullowney 1116aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1117aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1118afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1119afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1120aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1121afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1122afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1123afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1124aa372e3fSPaul Mullowney 1125aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1126afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 11279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1128afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1129afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1130afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1131afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1132afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1133afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1134afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 11355f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 11369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1137afb2bd1cSJunchao Zhang #endif 1138afb2bd1cSJunchao Zhang 11399566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 11409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1141aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1142aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1143aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1144aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1145aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1146afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1147afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1148afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 
11495f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1150afb2bd1cSJunchao Zhang #else 1151afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 11525f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1153afb2bd1cSJunchao Zhang #endif 1154d49cd2b7SBarry Smith 11559566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11569566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1157aa372e3fSPaul Mullowney 1158afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11599566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 11609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 11611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 11629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1163afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1164afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1165afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 11665f80ce2aSJacob Faibussowitsch &upTriFactorT->solveBufferSize)); 11679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1168afb2bd1cSJunchao Zhang #endif 1169afb2bd1cSJunchao Zhang 1170afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11715f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? 
*/ 11729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 11761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1177d49cd2b7SBarry Smith upTriFactorT->solveInfo, 11785f80ce2aSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1179d49cd2b7SBarry Smith #else 11805f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1181afb2bd1cSJunchao Zhang #endif 1182d49cd2b7SBarry Smith 11839566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11849566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1185aa372e3fSPaul Mullowney 1186da79fbbcSStefano Zampini /* assign the pointer */ 1187aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1188bda325fcSPaul Mullowney PetscFunctionReturn(0); 1189bda325fcSPaul Mullowney } 1190bda325fcSPaul Mullowney 1191a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1192a49f1ed0SStefano Zampini { 1193a49f1ed0SStefano Zampini __host__ __device__ 1194a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1195a49f1ed0SStefano Zampini { 1196a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1197a49f1ed0SStefano Zampini } 1198a49f1ed0SStefano Zampini }; 1199a49f1ed0SStefano Zampini 12003606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1201bda325fcSPaul Mullowney { 1202aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1203a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1204bda325fcSPaul Mullowney Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1205bda325fcSPaul Mullowney cusparseStatus_t stat; 1206aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1207b175d8bbSPaul Mullowney 1208bda325fcSPaul Mullowney PetscFunctionBegin; 12099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1210a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 121128b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1212a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1213*08401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12141a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 12159566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 12169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1217a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 12189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1219a49f1ed0SStefano Zampini } 1220a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1221aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1223aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1226aa372e3fSPaul Mullowney 1227b06137fdSPaul Mullowney /* set alpha and beta */ 12289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 12299566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 12309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 12319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 12329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 12339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1234b06137fdSPaul Mullowney 1235aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1236aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1237a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1238554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1239554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1240aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1241a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1242aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1243aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1244a3fdcf43SKarl Rupp 1245039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 124681902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1247afb2bd1cSJunchao Zhang 1248afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 12493606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1250afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1251afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1252afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1253afb2bd1cSJunchao Zhang 
matrixT->values->data().get(), 1254afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 12559566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 12563606e59fSJunchao Zhang #else 12573606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12583606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12593606e59fSJunchao Zhang 12603606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12613606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12623606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 12633606e59fSJunchao Zhang */ 12643606e59fSJunchao Zhang if (matrixT->num_entries) { 12653606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 12663606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 12673606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 12683606e59fSJunchao Zhang matrixT->values->data().get(), 12693606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 12709566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 12713606e59fSJunchao Zhang 12723606e59fSJunchao Zhang } else { 12733606e59fSJunchao Zhang matstructT->matDescr = NULL; 12743606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12753606e59fSJunchao Zhang } 12763606e59fSJunchao Zhang #endif 1277afb2bd1cSJunchao Zhang #endif 1278aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1279afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 
1280afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1281afb2bd1cSJunchao Zhang #else 1282aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 128351c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 128451c6d536SStefano Zampini /* First convert HYB to CSR */ 1285aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1286aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1287aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1288aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1290aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1291aa372e3fSPaul Mullowney 1292aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1293aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1294aa372e3fSPaul Mullowney temp->values->data().get(), 1295aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 12969566063dSJacob Faibussowitsch temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1297aa372e3fSPaul Mullowney 1298aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1299aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1300aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1301aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1302aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1303aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1304aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1305aa372e3fSPaul Mullowney 1306aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1307aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1308aa372e3fSPaul Mullowney temp->values->data().get(), 1309aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1310aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1311aa372e3fSPaul Mullowney tempT->values->data().get(), 1312aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1313aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 13149566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1315aa372e3fSPaul Mullowney 1316aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1317aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1319aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1320aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1321aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1322aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1323aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1324aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 13259566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1326aa372e3fSPaul Mullowney 1327aa372e3fSPaul Mullowney /* assign the pointer */ 1328aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13291a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1330aa372e3fSPaul Mullowney /* delete temporaries */ 1331aa372e3fSPaul Mullowney if (tempT) { 1332aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1333aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1334aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1335aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1336087f3262SPaul Mullowney } 1337aa372e3fSPaul Mullowney if (temp) { 1338aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1339aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1340aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1341aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1342aa372e3fSPaul Mullowney } 1343afb2bd1cSJunchao Zhang #endif 1344aa372e3fSPaul Mullowney } 1345a49f1ed0SStefano Zampini } 1346a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1347a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1348a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 134928b400f6SJacob Faibussowitsch 
PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 135028b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 135128b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 135228b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 135328b400f6SJacob Faibussowitsch PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 135428b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 135528b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 135628b400f6SJacob Faibussowitsch PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1357a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1358a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1359a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 13609566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1361a49f1ed0SStefano Zampini } 1362a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1363a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1364a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1365a49f1ed0SStefano Zampini 1366a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1367a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1368a49f1ed0SStefano Zampini void *csr2cscBuffer; 1369a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1370a49f1ed0SStefano Zampini stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1371a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1372a49f1ed0SStefano Zampini matrix->values->data().get(), 1373a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1374a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1375a49f1ed0SStefano Zampini matrixT->values->data().get(), 1376a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1377a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13789566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 13799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1380a49f1ed0SStefano Zampini #endif 1381a49f1ed0SStefano Zampini 13821a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13831a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13841a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13851a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13861a2c6b5cSJunchao Zhang 13871a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13881a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13891a2c6b5cSJunchao Zhang */ 13901a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 13911a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 13921a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 13931a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 13941a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1395a49f1ed0SStefano Zampini matrixT->values->data().get(), 1396a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1397a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1398a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13999566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1400a49f1ed0SStefano Zampini #else 1401a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14029566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1403a49f1ed0SStefano Zampini #endif 14041a2c6b5cSJunchao Zhang } else { 14051a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14061a2c6b5cSJunchao Zhang } 14071a2c6b5cSJunchao Zhang 1408a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1409a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1410a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 14119566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1412a49f1ed0SStefano Zampini #endif 1413a49f1ed0SStefano Zampini } 1414a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1415a49f1ed0SStefano Zampini 
thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1416a49f1ed0SStefano Zampini matrixT->values->begin())); 1417a49f1ed0SStefano Zampini } 14189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14199566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1420213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1421213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1422aa372e3fSPaul Mullowney /* assign the pointer */ 1423aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14241a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1425bda325fcSPaul Mullowney PetscFunctionReturn(0); 1426bda325fcSPaul Mullowney } 1427bda325fcSPaul Mullowney 1428a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1430bda325fcSPaul Mullowney { 1431c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1432465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1433465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1434465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1435465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1436bda325fcSPaul Mullowney cusparseStatus_t stat; 1437bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1438aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1439aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1440aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = 
(THRUSTARRAY*)cusparseTriFactors->workVector; 1441bda325fcSPaul Mullowney 1442bda325fcSPaul Mullowney PetscFunctionBegin; 1443aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1444aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 14459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1446aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1447aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1448bda325fcSPaul Mullowney } 1449bda325fcSPaul Mullowney 1450bda325fcSPaul Mullowney /* Get the GPU pointers */ 14519566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14529566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1453c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1454c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1455bda325fcSPaul Mullowney 14569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1457aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1458a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1459c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1460c41cb2e2SAlejandro Lamas Daviña xGPU); 1461aa372e3fSPaul Mullowney 1462aa372e3fSPaul Mullowney /* First, solve U */ 1463aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1464afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1466afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1467afb2bd1cSJunchao Zhang #endif 1468afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, 
upTriFactorT->descr, 1469aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1470aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1471aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1472aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1473d49cd2b7SBarry Smith xarray, 14741b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1475d49cd2b7SBarry Smith tempGPU->data().get(), 14769566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1477d49cd2b7SBarry Smith #else 14789566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1479afb2bd1cSJunchao Zhang #endif 1480aa372e3fSPaul Mullowney 1481aa372e3fSPaul Mullowney /* Then, solve L */ 1482aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1483afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1485afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1486afb2bd1cSJunchao Zhang #endif 1487afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1488aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1489aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1490aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1491aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1492d49cd2b7SBarry Smith tempGPU->data().get(), 14931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1494d49cd2b7SBarry Smith xarray, 14959566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1496d49cd2b7SBarry Smith #else 14979566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1498afb2bd1cSJunchao Zhang #endif 1499aa372e3fSPaul Mullowney 1500aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... 
can't be done in place. */ 1501a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1502c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1503aa372e3fSPaul Mullowney tempGPU->begin()); 1504aa372e3fSPaul Mullowney 1505aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1506a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1507bda325fcSPaul Mullowney 1508bda325fcSPaul Mullowney /* restore */ 15099566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15109566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1513bda325fcSPaul Mullowney PetscFunctionReturn(0); 1514bda325fcSPaul Mullowney } 1515bda325fcSPaul Mullowney 15166fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1517bda325fcSPaul Mullowney { 1518465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1519465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1520bda325fcSPaul Mullowney cusparseStatus_t stat; 1521bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1522aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1523aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1524aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1525bda325fcSPaul Mullowney 
1526bda325fcSPaul Mullowney PetscFunctionBegin; 1527aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1528aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1530aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1531aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1532bda325fcSPaul Mullowney } 1533bda325fcSPaul Mullowney 1534bda325fcSPaul Mullowney /* Get the GPU pointers */ 15359566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 15369566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1537bda325fcSPaul Mullowney 15389566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1539aa372e3fSPaul Mullowney /* First, solve U */ 1540aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1541afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1544afb2bd1cSJunchao Zhang #endif 1545afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1546aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1547aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1548aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1549aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1550d49cd2b7SBarry Smith barray, 15511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1552d49cd2b7SBarry Smith tempGPU->data().get(), 15539566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1554d49cd2b7SBarry Smith #else 15559566063dSJacob Faibussowitsch 
tempGPU->data().get());PetscCallCUSPARSE(stat); 1556afb2bd1cSJunchao Zhang #endif 1557aa372e3fSPaul Mullowney 1558aa372e3fSPaul Mullowney /* Then, solve L */ 1559aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1560afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1562afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1563afb2bd1cSJunchao Zhang #endif 1564afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1565aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1566aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1567aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1568aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1569d49cd2b7SBarry Smith tempGPU->data().get(), 15701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1571d49cd2b7SBarry Smith xarray, 15729566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1573d49cd2b7SBarry Smith #else 15749566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1575afb2bd1cSJunchao Zhang #endif 1576bda325fcSPaul Mullowney 1577bda325fcSPaul Mullowney /* restore */ 15789566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15799566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1582bda325fcSPaul Mullowney PetscFunctionReturn(0); 1583bda325fcSPaul Mullowney } 1584bda325fcSPaul Mullowney 15856fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15869ae82921SPaul Mullowney { 1587465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1588465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 
1589465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1590465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15919ae82921SPaul Mullowney cusparseStatus_t stat; 15929ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1593aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1594aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1595aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 15969ae82921SPaul Mullowney 15979ae82921SPaul Mullowney PetscFunctionBegin; 1598ebc8f436SDominic Meiser 1599e057df02SPaul Mullowney /* Get the GPU pointers */ 16009566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 16019566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1602c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1603c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16049ae82921SPaul Mullowney 16059566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1606aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1607a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1608c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16094e4bbfaaSStefano Zampini tempGPU->begin()); 1610aa372e3fSPaul Mullowney 1611aa372e3fSPaul Mullowney /* Next, solve L */ 1612aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1613afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16141b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1615afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1616afb2bd1cSJunchao Zhang #endif 1617afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1618aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1619aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1620aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1621aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1622d49cd2b7SBarry Smith tempGPU->data().get(), 16231b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1624d49cd2b7SBarry Smith xarray, 16259566063dSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1626d49cd2b7SBarry Smith #else 16279566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1628afb2bd1cSJunchao Zhang #endif 1629aa372e3fSPaul Mullowney 1630aa372e3fSPaul Mullowney /* Then, solve U */ 1631aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1632afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1635afb2bd1cSJunchao Zhang #endif 1636afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1637aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1638aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1639aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1640d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 16411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1642d49cd2b7SBarry Smith tempGPU->data().get(), 16439566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1644d49cd2b7SBarry Smith #else 16459566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1646afb2bd1cSJunchao Zhang #endif 1647d49cd2b7SBarry 
Smith 16484e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1649a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 16504e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16514e4bbfaaSStefano Zampini xGPU); 16529ae82921SPaul Mullowney 16539566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 16549566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 16559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 16579ae82921SPaul Mullowney PetscFunctionReturn(0); 16589ae82921SPaul Mullowney } 16599ae82921SPaul Mullowney 16606fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16619ae82921SPaul Mullowney { 1662465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1663465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16649ae82921SPaul Mullowney cusparseStatus_t stat; 16659ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1666aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1667aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1668aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 16699ae82921SPaul Mullowney 16709ae82921SPaul Mullowney PetscFunctionBegin; 1671e057df02SPaul Mullowney /* Get the GPU pointers */ 16729566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 16739566063dSJacob Faibussowitsch 
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 16749ae82921SPaul Mullowney 16759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1676aa372e3fSPaul Mullowney /* First, solve L */ 1677aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1678afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1680afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1681afb2bd1cSJunchao Zhang #endif 1682afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1683aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1684aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1685aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1686aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1687d49cd2b7SBarry Smith barray, 16881b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1689d49cd2b7SBarry Smith tempGPU->data().get(), 16909566063dSJacob Faibussowitsch loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1691d49cd2b7SBarry Smith #else 16929566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1693afb2bd1cSJunchao Zhang #endif 1694d49cd2b7SBarry Smith 1695aa372e3fSPaul Mullowney /* Next, solve U */ 1696aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1697afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16981b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1699afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1700afb2bd1cSJunchao Zhang #endif 1701afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1702aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1703aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1704aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1705aa372e3fSPaul Mullowney upTriFactor->solveInfo, 
1706d49cd2b7SBarry Smith tempGPU->data().get(), 17071b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1708d49cd2b7SBarry Smith xarray, 17099566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1710d49cd2b7SBarry Smith #else 17119566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1712afb2bd1cSJunchao Zhang #endif 17139ae82921SPaul Mullowney 17149566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 17159566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 17169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 17189ae82921SPaul Mullowney PetscFunctionReturn(0); 17199ae82921SPaul Mullowney } 17209ae82921SPaul Mullowney 17217e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17227e8381f9SStefano Zampini { 17237e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17247e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17257e8381f9SStefano Zampini 17267e8381f9SStefano Zampini PetscFunctionBegin; 17277e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17287e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17297e8381f9SStefano Zampini 17309566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 17329566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 17339566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 17349566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17357e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17367e8381f9SStefano Zampini } 
17377e8381f9SStefano Zampini PetscFunctionReturn(0); 17387e8381f9SStefano Zampini } 17397e8381f9SStefano Zampini 17407e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17417e8381f9SStefano Zampini { 17427e8381f9SStefano Zampini PetscFunctionBegin; 17439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 174467a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 174567a45760SJunchao Zhang PetscFunctionReturn(0); 174667a45760SJunchao Zhang } 174767a45760SJunchao Zhang 174867a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 174967a45760SJunchao Zhang { 175067a45760SJunchao Zhang PetscFunctionBegin; 17517e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 175267a45760SJunchao Zhang *array = NULL; 175367a45760SJunchao Zhang PetscFunctionReturn(0); 175467a45760SJunchao Zhang } 175567a45760SJunchao Zhang 175667a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 175767a45760SJunchao Zhang { 175867a45760SJunchao Zhang PetscFunctionBegin; 17599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 176067a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 176167a45760SJunchao Zhang PetscFunctionReturn(0); 176267a45760SJunchao Zhang } 176367a45760SJunchao Zhang 176467a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 176567a45760SJunchao Zhang { 176667a45760SJunchao Zhang PetscFunctionBegin; 176767a45760SJunchao Zhang *array = NULL; 176867a45760SJunchao Zhang PetscFunctionReturn(0); 176967a45760SJunchao Zhang } 177067a45760SJunchao Zhang 177167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 177267a45760SJunchao Zhang { 177367a45760SJunchao Zhang PetscFunctionBegin; 177467a45760SJunchao Zhang *array = 
((Mat_SeqAIJ*)A->data)->a; 177567a45760SJunchao Zhang PetscFunctionReturn(0); 177667a45760SJunchao Zhang } 177767a45760SJunchao Zhang 177867a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 177967a45760SJunchao Zhang { 178067a45760SJunchao Zhang PetscFunctionBegin; 178167a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 178267a45760SJunchao Zhang *array = NULL; 17837e8381f9SStefano Zampini PetscFunctionReturn(0); 17847e8381f9SStefano Zampini } 17857e8381f9SStefano Zampini 17867ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 17877ee59b9bSJunchao Zhang { 17887ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 17897ee59b9bSJunchao Zhang CsrMatrix *matrix; 17907ee59b9bSJunchao Zhang 17917ee59b9bSJunchao Zhang PetscFunctionBegin; 17927ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 17937ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 17947ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 17957ee59b9bSJunchao Zhang PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 17967ee59b9bSJunchao Zhang matrix = (CsrMatrix*)cusp->mat->mat; 17977ee59b9bSJunchao Zhang 17987ee59b9bSJunchao Zhang if (i) { 17997ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 18007ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 18017ee59b9bSJunchao Zhang #else 18027ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 18037ee59b9bSJunchao Zhang #endif 18047ee59b9bSJunchao Zhang } 18057ee59b9bSJunchao Zhang if (j) { 18067ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 18077ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 
18087ee59b9bSJunchao Zhang #else 18097ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 18107ee59b9bSJunchao Zhang #endif 18117ee59b9bSJunchao Zhang } 18127ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 18137ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 18147ee59b9bSJunchao Zhang PetscFunctionReturn(0); 18157ee59b9bSJunchao Zhang } 18167ee59b9bSJunchao Zhang 1817042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 18189ae82921SPaul Mullowney { 1819aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 18207c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 18219ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1822213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1823aa372e3fSPaul Mullowney cusparseStatus_t stat; 1824abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 18259ae82921SPaul Mullowney 18269ae82921SPaul Mullowney PetscFunctionBegin; 182728b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1828c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1829a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1830a49f1ed0SStefano Zampini CsrMatrix *matrix; 1831afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 183285ba7357SStefano Zampini 1833*08401ef6SPierre Jolivet PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 18349566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1835afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 18369566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 18379566063dSJacob Faibussowitsch 
PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 18389566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 184034d6c7a5SJose E. Roman } else { 1841abb89eb1SStefano Zampini PetscInt nnz; 18429566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 18449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 18457c700b8dSJunchao Zhang delete cusparsestruct->workVector; 184681902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1847a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1848a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18499ae82921SPaul Mullowney try { 18509ae82921SPaul Mullowney if (a->compressedrow.use) { 18519ae82921SPaul Mullowney m = a->compressedrow.nrows; 18529ae82921SPaul Mullowney ii = a->compressedrow.i; 18539ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 18549ae82921SPaul Mullowney } else { 1855213423ffSJunchao Zhang m = A->rmap->n; 1856213423ffSJunchao Zhang ii = a->i; 1857e6e9a74fSStefano Zampini ridx = NULL; 18589ae82921SPaul Mullowney } 1859*08401ef6SPierre Jolivet PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1860abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1861abb89eb1SStefano Zampini else nnz = a->nz; 1862*08401ef6SPierre Jolivet PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 18639ae82921SPaul Mullowney 186485ba7357SStefano Zampini /* create cusparse matrix */ 1865abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1866aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 18679566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 18689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 18699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 18709ae82921SPaul Mullowney 18719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 18729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 18739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 18749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 1878b06137fdSPaul Mullowney 1879aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1880aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1881aa372e3fSPaul Mullowney /* set the matrix */ 1882afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1883afb2bd1cSJunchao Zhang mat->num_rows = m; 1884afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1885abb89eb1SStefano Zampini mat->num_entries = nnz; 1886afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1887afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18889ae82921SPaul Mullowney 1889abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1890abb89eb1SStefano Zampini 
mat->column_indices->assign(a->j, a->j+nnz); 1891aa372e3fSPaul Mullowney 1892abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1893abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1894aa372e3fSPaul Mullowney 1895aa372e3fSPaul Mullowney /* assign the pointer */ 1896afb2bd1cSJunchao Zhang matstruct->mat = mat; 1897afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1898afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1899afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1900afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1901afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1902afb2bd1cSJunchao Zhang mat->values->data().get(), 1903afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 19049566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 1905afb2bd1cSJunchao Zhang } 1906afb2bd1cSJunchao Zhang #endif 1907aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1908afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1909afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1910afb2bd1cSJunchao Zhang #else 1911afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1912afb2bd1cSJunchao Zhang mat->num_rows = m; 1913afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1914abb89eb1SStefano Zampini mat->num_entries = nnz; 1915afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1916afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1917aa372e3fSPaul Mullowney 1918abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1919abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 
1920aa372e3fSPaul Mullowney 1921abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1922abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1923aa372e3fSPaul Mullowney 1924aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 19259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1926aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1927aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1928afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1929afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1930afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1931afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 19329566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1933aa372e3fSPaul Mullowney /* assign the pointer */ 1934aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1935aa372e3fSPaul Mullowney 1936afb2bd1cSJunchao Zhang if (mat) { 1937afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1938afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1939afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1940afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1941087f3262SPaul Mullowney } 1942afb2bd1cSJunchao Zhang #endif 1943087f3262SPaul Mullowney } 1944ca45077fSPaul Mullowney 1945aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1946213423ffSJunchao Zhang if (a->compressedrow.use) { 1947213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1948aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1949aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1950213423ffSJunchao Zhang tmp = m; 1951213423ffSJunchao Zhang } else { 1952213423ffSJunchao Zhang cusparsestruct->workVector = 
NULL; 1953213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1954213423ffSJunchao Zhang tmp = 0; 1955213423ffSJunchao Zhang } 19569566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 1957aa372e3fSPaul Mullowney 1958aa372e3fSPaul Mullowney /* assign the pointer */ 1959aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 19609ae82921SPaul Mullowney } catch(char *ex) { 196198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 19629ae82921SPaul Mullowney } 19639566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 19649566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 196534d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 196634d6c7a5SJose E. Roman } 1967abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 19689ae82921SPaul Mullowney } 19699ae82921SPaul Mullowney PetscFunctionReturn(0); 19709ae82921SPaul Mullowney } 19719ae82921SPaul Mullowney 1972c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1973aa372e3fSPaul Mullowney { 1974aa372e3fSPaul Mullowney template <typename Tuple> 1975aa372e3fSPaul Mullowney __host__ __device__ 1976aa372e3fSPaul Mullowney void operator()(Tuple t) 1977aa372e3fSPaul Mullowney { 1978aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1979aa372e3fSPaul Mullowney } 1980aa372e3fSPaul Mullowney }; 1981aa372e3fSPaul Mullowney 19827e8381f9SStefano Zampini struct VecCUDAEquals 19837e8381f9SStefano Zampini { 19847e8381f9SStefano Zampini template <typename Tuple> 19857e8381f9SStefano Zampini __host__ __device__ 19867e8381f9SStefano Zampini void operator()(Tuple t) 19877e8381f9SStefano Zampini { 19887e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19897e8381f9SStefano Zampini } 19907e8381f9SStefano Zampini }; 19917e8381f9SStefano Zampini 1992e6e9a74fSStefano Zampini struct 
VecCUDAEqualsReverse 1993e6e9a74fSStefano Zampini { 1994e6e9a74fSStefano Zampini template <typename Tuple> 1995e6e9a74fSStefano Zampini __host__ __device__ 1996e6e9a74fSStefano Zampini void operator()(Tuple t) 1997e6e9a74fSStefano Zampini { 1998e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1999e6e9a74fSStefano Zampini } 2000e6e9a74fSStefano Zampini }; 2001e6e9a74fSStefano Zampini 2002afb2bd1cSJunchao Zhang struct MatMatCusparse { 2003ccdfe979SStefano Zampini PetscBool cisdense; 2004ccdfe979SStefano Zampini PetscScalar *Bt; 2005ccdfe979SStefano Zampini Mat X; 2006fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2007fcdce8c4SStefano Zampini PetscLogDouble flops; 2008fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2009b4285af6SJunchao Zhang 2010afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2011fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2012afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2013afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2014afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2015afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2016b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2017b4285af6SJunchao Zhang void *dBuffer4; 2018b4285af6SJunchao Zhang void *dBuffer5; 2019b4285af6SJunchao Zhang #endif 2020fcdce8c4SStefano Zampini size_t mmBufferSize; 2021fcdce8c4SStefano Zampini void *mmBuffer; 2022fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2023fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2024afb2bd1cSJunchao Zhang #endif 2025afb2bd1cSJunchao Zhang }; 2026ccdfe979SStefano Zampini 2027ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2028ccdfe979SStefano Zampini { 2029ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse 
*)data; 2030ccdfe979SStefano Zampini 2031ccdfe979SStefano Zampini PetscFunctionBegin; 20329566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2033fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2034afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 20359566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 20369566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 20379566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 20389566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2039b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 20409566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 20419566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2042b4285af6SJunchao Zhang #endif 20439566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 20449566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2045afb2bd1cSJunchao Zhang #endif 20469566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 20479566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2048ccdfe979SStefano Zampini PetscFunctionReturn(0); 2049ccdfe979SStefano Zampini } 2050ccdfe979SStefano Zampini 2051ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2052ccdfe979SStefano Zampini 2053ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2054ccdfe979SStefano Zampini { 2055ccdfe979SStefano Zampini Mat_Product *product = C->product; 2056ccdfe979SStefano Zampini Mat A,B; 2057afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 
2058ccdfe979SStefano Zampini PetscBool flg,biscuda; 2059ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2060ccdfe979SStefano Zampini cusparseStatus_t stat; 2061ccdfe979SStefano Zampini cusparseOperation_t opA; 2062ccdfe979SStefano Zampini const PetscScalar *barray; 2063ccdfe979SStefano Zampini PetscScalar *carray; 2064ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2065ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2066ccdfe979SStefano Zampini CsrMatrix *csrmat; 2067ccdfe979SStefano Zampini 2068ccdfe979SStefano Zampini PetscFunctionBegin; 2069ccdfe979SStefano Zampini MatCheckProduct(C,1); 207028b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2071ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2072ccdfe979SStefano Zampini A = product->A; 2073ccdfe979SStefano Zampini B = product->B; 20749566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 207528b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2076ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2077ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 207828b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 20799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2080ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2081ccdfe979SStefano Zampini switch (product->type) { 2082ccdfe979SStefano Zampini case MATPRODUCT_AB: 2083ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2084ccdfe979SStefano Zampini mat = cusp->mat; 2085ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 
2086ccdfe979SStefano Zampini m = A->rmap->n; 2087ccdfe979SStefano Zampini n = B->cmap->n; 2088ccdfe979SStefano Zampini break; 2089ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20901a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2091e6e9a74fSStefano Zampini mat = cusp->mat; 2092e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2093e6e9a74fSStefano Zampini } else { 20949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2095ccdfe979SStefano Zampini mat = cusp->matTranspose; 2096ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2097e6e9a74fSStefano Zampini } 2098ccdfe979SStefano Zampini m = A->cmap->n; 2099ccdfe979SStefano Zampini n = B->cmap->n; 2100ccdfe979SStefano Zampini break; 2101ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2102ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2103ccdfe979SStefano Zampini mat = cusp->mat; 2104ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2105ccdfe979SStefano Zampini m = A->rmap->n; 2106ccdfe979SStefano Zampini n = B->rmap->n; 2107ccdfe979SStefano Zampini break; 2108ccdfe979SStefano Zampini default: 210998921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2110ccdfe979SStefano Zampini } 211128b400f6SJacob Faibussowitsch PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2112ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2113ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 21149566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 21159566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 21169566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2117afb2bd1cSJunchao Zhang 21189566063dSJacob Faibussowitsch 
PetscCall(MatDenseGetLDA(B,&blda)); 2119c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 21209566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 21219566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2122c8378d12SStefano Zampini } else { 21239566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 21249566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C,&clda)); 2125c8378d12SStefano Zampini } 2126c8378d12SStefano Zampini 21279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2128afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2129afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2130a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2131afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2132fcdce8c4SStefano Zampini size_t mmBufferSize; 21339566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2134afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 21359566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2136afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2137afb2bd1cSJunchao Zhang } 2138c8378d12SStefano Zampini 21399566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2140afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 21419566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2142afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2143afb2bd1cSJunchao Zhang } 2144afb2bd1cSJunchao Zhang 2145afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2146afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2147afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2148afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2149afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2150afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 21519566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2152afb2bd1cSJunchao Zhang } 2153afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2154afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2155afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 21569566063dSJacob Faibussowitsch cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2157fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 21589566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 21599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2160fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2161fcdce8c4SStefano Zampini } 2162afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2163afb2bd1cSJunchao Zhang } else { 2164afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 21659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 21669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 21679566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2168afb2bd1cSJunchao Zhang } 2169afb2bd1cSJunchao Zhang 2170afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2171afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2172afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2173afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 21749566063dSJacob Faibussowitsch cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2175afb2bd1cSJunchao Zhang #else 2176afb2bd1cSJunchao Zhang PetscInt k; 2177afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2178ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2179ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2180ccdfe979SStefano Zampini cublasStatus_t cerr; 2181ccdfe979SStefano Zampini 21829566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2183ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2184ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2185ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2186ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 21879566063dSJacob Faibussowitsch mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2188ccdfe979SStefano Zampini blda = B->cmap->n; 2189afb2bd1cSJunchao Zhang k = B->cmap->n; 2190afb2bd1cSJunchao Zhang } else { 2191afb2bd1cSJunchao Zhang k = B->rmap->n; 2192ccdfe979SStefano Zampini } 2193ccdfe979SStefano Zampini 2194afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2195ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2196afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2197ccdfe979SStefano Zampini csrmat->values->data().get(), 2198ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2199ccdfe979SStefano Zampini 
csrmat->column_indices->data().get(), 2200ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 22019566063dSJacob Faibussowitsch carray,clda);PetscCallCUSPARSE(stat); 2202afb2bd1cSJunchao Zhang #endif 22039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 22049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 22059566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2206ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 22079566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 22089566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2209ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 22109566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 22119566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2212ccdfe979SStefano Zampini } else { 22139566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2214ccdfe979SStefano Zampini } 2215ccdfe979SStefano Zampini if (mmdata->cisdense) { 22169566063dSJacob Faibussowitsch PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2217ccdfe979SStefano Zampini } 2218ccdfe979SStefano Zampini if (!biscuda) { 22199566063dSJacob Faibussowitsch PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2220ccdfe979SStefano Zampini } 2221ccdfe979SStefano Zampini PetscFunctionReturn(0); 2222ccdfe979SStefano Zampini } 2223ccdfe979SStefano Zampini 2224ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2225ccdfe979SStefano Zampini { 2226ccdfe979SStefano Zampini Mat_Product *product = C->product; 2227ccdfe979SStefano Zampini Mat A,B; 2228ccdfe979SStefano Zampini PetscInt m,n; 2229ccdfe979SStefano 
Zampini PetscBool cisdense,flg; 2230ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2231ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2232ccdfe979SStefano Zampini 2233ccdfe979SStefano Zampini PetscFunctionBegin; 2234ccdfe979SStefano Zampini MatCheckProduct(C,1); 223528b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2236ccdfe979SStefano Zampini A = product->A; 2237ccdfe979SStefano Zampini B = product->B; 22389566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 223928b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2240ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2241*08401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2242ccdfe979SStefano Zampini switch (product->type) { 2243ccdfe979SStefano Zampini case MATPRODUCT_AB: 2244ccdfe979SStefano Zampini m = A->rmap->n; 2245ccdfe979SStefano Zampini n = B->cmap->n; 2246ccdfe979SStefano Zampini break; 2247ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2248ccdfe979SStefano Zampini m = A->cmap->n; 2249ccdfe979SStefano Zampini n = B->cmap->n; 2250ccdfe979SStefano Zampini break; 2251ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2252ccdfe979SStefano Zampini m = A->rmap->n; 2253ccdfe979SStefano Zampini n = B->rmap->n; 2254ccdfe979SStefano Zampini break; 2255ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2256ccdfe979SStefano Zampini m = B->cmap->n; 2257ccdfe979SStefano Zampini n = B->cmap->n; 2258ccdfe979SStefano Zampini break; 2259ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2260ccdfe979SStefano Zampini m = B->rmap->n; 2261ccdfe979SStefano Zampini n = B->rmap->n; 2262ccdfe979SStefano Zampini break; 2263ccdfe979SStefano Zampini default: 226498921bdaSJacob Faibussowitsch 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2265ccdfe979SStefano Zampini } 22669566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 2267ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 22689566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 22699566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQDENSECUDA)); 2270ccdfe979SStefano Zampini 2271ccdfe979SStefano Zampini /* product data */ 22729566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2273ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2274afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2275afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2276ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 22779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 2278ccdfe979SStefano Zampini } 2279afb2bd1cSJunchao Zhang #endif 2280ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2281ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 22829566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 22839566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 2284ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 22859566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 2286ccdfe979SStefano Zampini } else { 22879566063dSJacob Faibussowitsch 
PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 2288ccdfe979SStefano Zampini } 2289ccdfe979SStefano Zampini } 2290ccdfe979SStefano Zampini C->product->data = mmdata; 2291ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2292ccdfe979SStefano Zampini 2293ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2294ccdfe979SStefano Zampini PetscFunctionReturn(0); 2295ccdfe979SStefano Zampini } 2296ccdfe979SStefano Zampini 2297fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2298ccdfe979SStefano Zampini { 2299ccdfe979SStefano Zampini Mat_Product *product = C->product; 2300fcdce8c4SStefano Zampini Mat A,B; 2301fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2302fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2303fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2304fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2305fcdce8c4SStefano Zampini PetscBool flg; 2306fcdce8c4SStefano Zampini cusparseStatus_t stat; 2307fcdce8c4SStefano Zampini MatProductType ptype; 2308fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2309fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2310fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2311fcdce8c4SStefano Zampini #endif 2312b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2313ccdfe979SStefano Zampini 2314ccdfe979SStefano Zampini PetscFunctionBegin; 2315ccdfe979SStefano Zampini MatCheckProduct(C,1); 231628b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 23179566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 231828b400f6SJacob Faibussowitsch 
PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2319fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2320fcdce8c4SStefano Zampini A = product->A; 2321fcdce8c4SStefano Zampini B = product->B; 2322fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2323fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2324fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2325*08401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2326fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 232728b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2328fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 232928b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2330fcdce8c4SStefano Zampini goto finalize; 2331fcdce8c4SStefano Zampini } 2332fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 23339566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 233428b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 23359566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 233628b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 233728b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 
233828b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2339fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2340fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2341fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2342*08401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2343*08401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2344*08401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 23459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 23469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2347fcdce8c4SStefano Zampini 2348fcdce8c4SStefano Zampini ptype = product->type; 2349fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2350fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 235128b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2352fa046f9fSJunchao Zhang } 2353fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2354fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 235528b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2356fa046f9fSJunchao Zhang } 2357fcdce8c4SStefano Zampini switch (ptype) { 2358fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2359fcdce8c4SStefano Zampini Amat = Acusp->mat; 
2360fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2361fcdce8c4SStefano Zampini break; 2362fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2363fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2364fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2365fcdce8c4SStefano Zampini break; 2366fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2367fcdce8c4SStefano Zampini Amat = Acusp->mat; 2368fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2369fcdce8c4SStefano Zampini break; 2370fcdce8c4SStefano Zampini default: 237198921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2372fcdce8c4SStefano Zampini } 2373fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 237428b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 237528b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 237628b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2377fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2378fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2379fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 238028b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 238128b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 238228b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 23839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2384fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2385fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 23869566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2387b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2388b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2389b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2390b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 23919566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2392b4285af6SJunchao Zhang #else 2393b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2394fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2395fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 23969566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2397b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2398fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 23999566063dSJacob Faibussowitsch cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2400b4285af6SJunchao Zhang #endif 2401fcdce8c4SStefano Zampini #else 2402b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2403fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2404fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2405fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 24069566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2407fcdce8c4SStefano Zampini #endif 24089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 24099566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 24109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2411fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2412fcdce8c4SStefano Zampini finalize: 2413fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 24149566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 24159566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 24169566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 2417fcdce8c4SStefano Zampini c->reallocs = 0; 2418fcdce8c4SStefano Zampini C->info.mallocs += 0; 2419fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2420fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2421fcdce8c4SStefano Zampini C->num_ass++; 2422ccdfe979SStefano Zampini PetscFunctionReturn(0); 2423ccdfe979SStefano Zampini } 2424fcdce8c4SStefano Zampini 
2425fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2426fcdce8c4SStefano Zampini { 2427fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2428fcdce8c4SStefano Zampini Mat A,B; 2429fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2430fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2431fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2432fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2433fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2434fcdce8c4SStefano Zampini PetscBool flg; 2435fcdce8c4SStefano Zampini cusparseStatus_t stat; 2436fcdce8c4SStefano Zampini MatProductType ptype; 2437fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2438fcdce8c4SStefano Zampini PetscLogDouble flops; 2439fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2440fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2441fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2442fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2443fcdce8c4SStefano Zampini #else 2444fcdce8c4SStefano Zampini int cnz; 2445fcdce8c4SStefano Zampini #endif 2446b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2447fcdce8c4SStefano Zampini 2448fcdce8c4SStefano Zampini PetscFunctionBegin; 2449fcdce8c4SStefano Zampini MatCheckProduct(C,1); 245028b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2451fcdce8c4SStefano Zampini A = product->A; 2452fcdce8c4SStefano Zampini B = product->B; 24539566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 245428b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 24559566063dSJacob Faibussowitsch 
PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 245628b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2457fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2458fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2459fcdce8c4SStefano Zampini /* product data */ 24609566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2461fcdce8c4SStefano Zampini C->product->data = mmdata; 2462fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2463fcdce8c4SStefano Zampini 24649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 24659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2466d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2467d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2468*08401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2469*08401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2470d60bce21SJunchao Zhang 2471fcdce8c4SStefano Zampini ptype = product->type; 2472fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2473fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2474fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2475fa046f9fSJunchao Zhang } 2476fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2477fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2478fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2479fa046f9fSJunchao Zhang } 2480fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2481fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2482fcdce8c4SStefano Zampini switch (ptype) { 
2483fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2484fcdce8c4SStefano Zampini m = A->rmap->n; 2485fcdce8c4SStefano Zampini n = B->cmap->n; 2486fcdce8c4SStefano Zampini k = A->cmap->n; 2487fcdce8c4SStefano Zampini Amat = Acusp->mat; 2488fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2489fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2490fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2491fcdce8c4SStefano Zampini break; 2492fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2493fcdce8c4SStefano Zampini m = A->cmap->n; 2494fcdce8c4SStefano Zampini n = B->cmap->n; 2495fcdce8c4SStefano Zampini k = A->rmap->n; 24969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2497fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2498fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2499fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2500fcdce8c4SStefano Zampini break; 2501fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2502fcdce8c4SStefano Zampini m = A->rmap->n; 2503fcdce8c4SStefano Zampini n = B->rmap->n; 2504fcdce8c4SStefano Zampini k = A->cmap->n; 25059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2506fcdce8c4SStefano Zampini Amat = Acusp->mat; 2507fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2508fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2509fcdce8c4SStefano Zampini break; 2510fcdce8c4SStefano Zampini default: 251198921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2512fcdce8c4SStefano Zampini } 2513fcdce8c4SStefano Zampini 2514fcdce8c4SStefano Zampini /* create cusparse matrix */ 25159566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 25169566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2517fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 
2518fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2519fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2520fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2521fcdce8c4SStefano Zampini 2522fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2523fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2524fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 25259566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 25269566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2527fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2528fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2529fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2530fcdce8c4SStefano Zampini } else { 2531fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2532fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2533fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2534fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2535fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2536fcdce8c4SStefano Zampini } 2537fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2538fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2539fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2540fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2541fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2542fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 25439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 25449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 25459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 25469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 25479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 25489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 25499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2552fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2553fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2554fcdce8c4SStefano Zampini c->nz = 0; 2555fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2556fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2557fcdce8c4SStefano Zampini goto finalizesym; 2558fcdce8c4SStefano Zampini } 2559fcdce8c4SStefano Zampini 256028b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 256128b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2562fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2563fcdce8c4SStefano Zampini if (!biscompressed) { 2564fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2565fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2566fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2567fcdce8c4SStefano Zampini #endif 2568fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2569fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2570fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2571fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2572fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2573fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2574fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2575fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2576fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2577fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2578fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 25799566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2580fcdce8c4SStefano Zampini } 2581fcdce8c4SStefano 
Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2582fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2583fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2584fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2585fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2586fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2587fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2588fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 25899566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2590fcdce8c4SStefano Zampini } 2591fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2592fcdce8c4SStefano Zampini #endif 2593fcdce8c4SStefano Zampini } 259428b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 259528b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2596fcdce8c4SStefano Zampini /* precompute flops count */ 2597fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2598fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2599fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2600fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2601fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2602fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2603fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2604fcdce8c4SStefano Zampini } 2605fcdce8c4SStefano Zampini } 2606fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2607fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2608fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2609fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2610fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 
2611fcdce8c4SStefano Zampini } 2612fcdce8c4SStefano Zampini } else { /* TODO */ 2613fcdce8c4SStefano Zampini flops = 0.; 2614fcdce8c4SStefano Zampini } 2615fcdce8c4SStefano Zampini 2616fcdce8c4SStefano Zampini mmdata->flops = flops; 26179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2618b4285af6SJunchao Zhang 2619fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 26209566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2621fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2622fcdce8c4SStefano Zampini NULL, NULL, NULL, 2623fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 26249566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 26259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2626b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2627b4285af6SJunchao Zhang { 2628b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2629b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2630b4285af6SJunchao Zhang */ 2631b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2632b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2633b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2634b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2635b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2636b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2637b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2638b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2639b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2640b4285af6SJunchao Zhang 2641b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2642b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2643b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2644b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26459566063dSJacob Faibussowitsch &bufferSize1, NULL);PetscCallCUSPARSE(stat); 26469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2647b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2648b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2649b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26509566063dSJacob Faibussowitsch &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2651b4285af6SJunchao Zhang 2652b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2653b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2654b4285af6SJunchao Zhang 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26559566063dSJacob Faibussowitsch &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 26569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 26579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 26589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2659b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2660b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26619566063dSJacob Faibussowitsch &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 26629566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 26639566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 2664b4285af6SJunchao Zhang 2665b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2666b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 26679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2668b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2669b4285af6SJunchao Zhang /* allocate matrix C */ 26709566063dSJacob Faibussowitsch Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 26719566063dSJacob Faibussowitsch Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2672b4285af6SJunchao Zhang /* update matC with the new pointers */ 2673b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 26749566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2675b4285af6SJunchao Zhang 
2676b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2677b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2678b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26799566063dSJacob Faibussowitsch &bufferSize5, NULL);PetscCallCUSPARSE(stat); 26809566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2681b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2682b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26839566063dSJacob Faibussowitsch &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 26849566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 2685b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2686b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2687b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26889566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 26899566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 2690b4285af6SJunchao Zhang } 2691ae37ee31SJunchao Zhang #else 2692b4285af6SJunchao Zhang size_t bufSize2; 2693fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2694b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2695fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2696fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26979566063dSJacob 
Faibussowitsch mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 26989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2699fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2700b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2701fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2702fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27039566063dSJacob Faibussowitsch mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2704fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2705b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2706fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2707fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27089566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2709fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2710fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2711fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2712fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2713fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 27149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2715fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2716b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2717fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2718fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27199566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2720fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 27219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2722fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 27239566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2724fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 27259566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2726fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 27279566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2728fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 27299566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2730b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2731fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 27329566063dSJacob Faibussowitsch cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2733ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2734fcdce8c4SStefano Zampini #else 27359566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2736b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2737fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2738fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2739fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 27409566063dSJacob Faibussowitsch Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2741fcdce8c4SStefano Zampini c->nz = cnz; 2742fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 27439566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2744fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 27459566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2746fcdce8c4SStefano Zampini 27479566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2748fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2749fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2750fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2751b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2752fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2753fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2754fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 27559566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2756fcdce8c4SStefano Zampini #endif 27579566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 27589566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2759fcdce8c4SStefano Zampini finalizesym: 2760fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2761fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2762fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 27639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 27649566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 2765fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2766fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2767fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2768fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2769fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2770fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2771fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27739566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2774fcdce8c4SStefano Zampini } else { 2775fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2776fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2779fcdce8c4SStefano Zampini } 2780fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2781fcdce8c4SStefano Zampini PetscInt r = 0; 2782fcdce8c4SStefano Zampini c->i[0] = 0; 2783fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2784fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2785fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2786fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2787fcdce8c4SStefano Zampini } 2788fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2789fcdce8c4SStefano Zampini } 27909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 27919566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 27929566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 2793fcdce8c4SStefano Zampini c->maxnz = c->nz; 2794fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2795fcdce8c4SStefano Zampini c->rmax = 0; 2796fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2797fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2798fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2799fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 2800fcdce8c4SStefano Zampini c->rmax = 
PetscMax(c->rmax,nn); 2801fcdce8c4SStefano Zampini } 28029566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 28039566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 2804fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2805fcdce8c4SStefano Zampini 2806fcdce8c4SStefano Zampini C->nonzerostate++; 28079566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 28089566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 2809fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2810fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2811fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2812fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2813fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2814abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2815fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2816fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2817fcdce8c4SStefano Zampini } 2818fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2819fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2820fcdce8c4SStefano Zampini } 2821fcdce8c4SStefano Zampini 2822fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2823fcdce8c4SStefano Zampini 2824fcdce8c4SStefano Zampini /* Handles sparse or dense B: selects mat->ops->productsymbolic for mat->product. (1) If B base-type-compares to MATSEQDENSE: AB, AtB, ABt, PtAP and RARt get MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA, or are delegated to MatProductSetFromOptions_SeqAIJ_SeqDense() when A is bound to the CPU; ABC gets MatProductSymbolic_ABC_Basic. (2) Else, if B (and C, for ABC) is MATSEQAIJCUSPARSE and neither A nor B (nor C, for ABC) is bound to the CPU: AB, AtB and ABt get MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; PtAP, RARt and ABC get MatProductSymbolic_ABC_Basic. The backend_cpu options (read for AB, AtB, PtAP, RARt and ABC, but not for ABt) switch this case off. (3) Otherwise MatProductSetFromOptions_SeqAIJ() decides. */ 2825fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2826fcdce8c4SStefano Zampini { 2827fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2828fcdce8c4SStefano Zampini PetscErrorCode ierr; 2829fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2830fcdce8c4SStefano Zampini 2831fcdce8c4SStefano Zampini 
PetscFunctionBegin; 2832fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 28339566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 2834abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 28359566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 2836fcdce8c4SStefano Zampini } 2837fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2838fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2839fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 28409566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 2841fcdce8c4SStefano Zampini } 2842fcdce8c4SStefano Zampini } 284365e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 284465e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 284565e4b4d4SStefano Zampini switch (product->type) { 284665e4b4d4SStefano Zampini case MATPRODUCT_AB: 284765e4b4d4SStefano Zampini if (product->api_user) { 28489566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");PetscCall(ierr); 28499566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 28509566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 285165e4b4d4SStefano Zampini } else { 28529566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");PetscCall(ierr); 28539566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 28549566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 285565e4b4d4SStefano Zampini } 285665e4b4d4SStefano Zampini break; 285765e4b4d4SStefano Zampini 
case MATPRODUCT_AtB: 285865e4b4d4SStefano Zampini if (product->api_user) { 28599566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");PetscCall(ierr); 28609566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 28619566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 286265e4b4d4SStefano Zampini } else { 28639566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");PetscCall(ierr); 28649566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 28659566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 286665e4b4d4SStefano Zampini } 286765e4b4d4SStefano Zampini break; 286865e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 286965e4b4d4SStefano Zampini if (product->api_user) { 28709566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");PetscCall(ierr); 28719566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 28729566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 287365e4b4d4SStefano Zampini } else { 28749566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");PetscCall(ierr); 28759566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 28769566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 287765e4b4d4SStefano Zampini } 287865e4b4d4SStefano Zampini break; 287965e4b4d4SStefano Zampini case MATPRODUCT_RARt: 
288065e4b4d4SStefano Zampini if (product->api_user) { 28819566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");PetscCall(ierr); 28829566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 28839566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 288465e4b4d4SStefano Zampini } else { 28859566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");PetscCall(ierr); 28869566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 28879566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 288865e4b4d4SStefano Zampini } 288965e4b4d4SStefano Zampini break; 289065e4b4d4SStefano Zampini case MATPRODUCT_ABC: 289165e4b4d4SStefano Zampini if (product->api_user) { 28929566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");PetscCall(ierr); 28939566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 28949566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 289565e4b4d4SStefano Zampini } else { 28969566063dSJacob Faibussowitsch ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");PetscCall(ierr); 28979566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 28989566063dSJacob Faibussowitsch ierr = PetscOptionsEnd();PetscCall(ierr); 289965e4b4d4SStefano Zampini } 290065e4b4d4SStefano Zampini break; 290165e4b4d4SStefano Zampini default: 290265e4b4d4SStefano Zampini break; 290365e4b4d4SStefano Zampini } 
290465e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 290565e4b4d4SStefano Zampini } 290665e4b4d4SStefano Zampini /* dispatch */ 2907fcdce8c4SStefano Zampini if (isdense) { 2908ccdfe979SStefano Zampini switch (product->type) { 2909ccdfe979SStefano Zampini case MATPRODUCT_AB: 2910ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2911ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2912ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2913ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2914fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 29159566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2916fcdce8c4SStefano Zampini } else { 2917fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2918fcdce8c4SStefano Zampini } 2919fcdce8c4SStefano Zampini break; 2920fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2921fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2922fcdce8c4SStefano Zampini break; 2923ccdfe979SStefano Zampini default: 2924ccdfe979SStefano Zampini break; 2925ccdfe979SStefano Zampini } 2926fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2927fcdce8c4SStefano Zampini switch (product->type) { 2928fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2929fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2930fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2931fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2932fcdce8c4SStefano Zampini break; 2933fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2934fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2935fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2936fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2937fcdce8c4SStefano Zampini break; 2938fcdce8c4SStefano Zampini default: 2939fcdce8c4SStefano Zampini break; 2940fcdce8c4SStefano Zampini } 2941fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 
29429566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 2943fcdce8c4SStefano Zampini } 2944ccdfe979SStefano Zampini PetscFunctionReturn(0); 2945ccdfe979SStefano Zampini } 2946ccdfe979SStefano Zampini 29476fa9248bSJed Brown /* yy = A*xx. Calls MatMultAddKernel_SeqAIJCUSPARSE() with a NULL vector to add and trans = herm = PETSC_FALSE. */ static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 29489ae82921SPaul Mullowney { 29499ae82921SPaul Mullowney PetscFunctionBegin; 29509566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 2951e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2952e6e9a74fSStefano Zampini } 2953e6e9a74fSStefano Zampini 2954e6e9a74fSStefano Zampini /* zz = A*xx + yy (trans = herm = PETSC_FALSE). */ static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2955e6e9a74fSStefano Zampini { 2956e6e9a74fSStefano Zampini PetscFunctionBegin; 29579566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 2958e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2959e6e9a74fSStefano Zampini } 2960e6e9a74fSStefano Zampini 2961e6e9a74fSStefano Zampini /* yy = A^H*xx (trans = herm = PETSC_TRUE, NULL vector to add). */ static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2962e6e9a74fSStefano Zampini { 2963e6e9a74fSStefano Zampini PetscFunctionBegin; 29649566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 2965e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2966e6e9a74fSStefano Zampini } 2967e6e9a74fSStefano Zampini 2968e6e9a74fSStefano Zampini /* zz = A^H*xx + yy (trans = herm = PETSC_TRUE). */ static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2969e6e9a74fSStefano Zampini { 2970e6e9a74fSStefano Zampini PetscFunctionBegin; 29719566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 29729ae82921SPaul Mullowney PetscFunctionReturn(0); 29739ae82921SPaul Mullowney } 29749ae82921SPaul Mullowney 29756fa9248bSJed Brown /* yy = A^T*xx (trans = PETSC_TRUE, herm = PETSC_FALSE, NULL vector to add). */ static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2976ca45077fSPaul
Mullowney { 2977ca45077fSPaul Mullowney PetscFunctionBegin; 29789566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 2979ca45077fSPaul Mullowney PetscFunctionReturn(0); 2980ca45077fSPaul Mullowney } 2981ca45077fSPaul Mullowney 2982a0e72f99SJunchao Zhang /* CUDA kernel: the thread with global index i = blockIdx.x*blockDim.x + threadIdx.x adds x[i] into y[idx[i]] when i < n. The update is a plain (non-atomic) +=, so it presumably relies on idx holding no repeated entries - TODO confirm at the launch site. */ __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2983a0e72f99SJunchao Zhang { 2984a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2985a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2986a0e72f99SJunchao Zhang } 2987a0e72f99SJunchao Zhang 2988afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op. yy may be NULL, in which case z = op(A) x. herm without trans is rejected with PETSC_ERR_GPU. If the matrix has no nonzeros (a->nz == 0), zz is set to 0 (NULL yy) or to a copy of yy and the routine returns early. */ 2989e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 29909ae82921SPaul Mullowney { 29919ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2992aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 29939ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2994e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2995e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2996e6e9a74fSStefano Zampini PetscBool compressed; 2997afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2998afb2bd1cSJunchao Zhang PetscInt nx,ny; 2999afb2bd1cSJunchao Zhang #endif 30006e111a19SKarl Rupp 30019ae82921SPaul Mullowney PetscFunctionBegin; 3002*08401ef6SPierre Jolivet PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3003cbc6b225SStefano Zampini if (!a->nz) { 30049566063dSJacob Faibussowitsch if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 30059566063dSJacob Faibussowitsch else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3006e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3007e6e9a74fSStefano Zampini } 
300834d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 30099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3010e6e9a74fSStefano Zampini if (!trans) { 30119ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 30125f80ce2aSJacob Faibussowitsch PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3013e6e9a74fSStefano Zampini } else { 30141a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3015e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3016e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3017e6e9a74fSStefano Zampini } else { 30189566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3019e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3020e6e9a74fSStefano Zampini } 3021e6e9a74fSStefano Zampini } 3022e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3023e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3024213423ffSJunchao Zhang 3025e6e9a74fSStefano Zampini try { 30269566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 30279566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 30289566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3029afb2bd1cSJunchao Zhang 30309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3031e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3032afb2bd1cSJunchao Zhang /* z = A x + beta y. 
3033afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3034afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3035afb2bd1cSJunchao Zhang */ 3036e6e9a74fSStefano Zampini xptr = xarray; 3037afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3038213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3039afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3040afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3041afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3042afb2bd1cSJunchao Zhang */ 3043afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3044afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3045afb2bd1cSJunchao Zhang nx = mat->num_cols; 3046afb2bd1cSJunchao Zhang ny = mat->num_rows; 3047afb2bd1cSJunchao Zhang } 3048afb2bd1cSJunchao Zhang #endif 3049e6e9a74fSStefano Zampini } else { 3050afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3051afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3052afb2bd1cSJunchao Zhang Note A^Tx is of full length, so it is written straight into z. beta is 1.0 only when y is stored in z (yy == zz); otherwise z was obtained write-only and may hold stale data, so beta is 0 and yy is added after the SpMV. 3053afb2bd1cSJunchao Zhang */ 3054afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3055e6e9a74fSStefano Zampini dptr = zarray; 3056e6e9a74fSStefano Zampini beta = (yy == zz) ? 
matstruct->beta_one : matstruct->beta_zero; 3057afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3058e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3059a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3060e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3061e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3062e6e9a74fSStefano Zampini } 3063afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3064afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3065afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3066afb2bd1cSJunchao Zhang nx = mat->num_rows; 3067afb2bd1cSJunchao Zhang ny = mat->num_cols; 3068afb2bd1cSJunchao Zhang } 3069afb2bd1cSJunchao Zhang #endif 3070e6e9a74fSStefano Zampini } 30719ae82921SPaul Mullowney 3072afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3073aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3074afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 30755f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3076afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 30779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 30789566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 
30799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3080afb2bd1cSJunchao Zhang matstruct->matDescr, 3081afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3082afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3083afb2bd1cSJunchao Zhang cusparse_scalartype, 3084afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 30855f80ce2aSJacob Faibussowitsch &matstruct->cuSpMV[opA].spmvBufferSize)); 30869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3087afb2bd1cSJunchao Zhang 3088afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3089afb2bd1cSJunchao Zhang } else { 3090afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 30919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 30929566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3093afb2bd1cSJunchao Zhang } 3094afb2bd1cSJunchao Zhang 30959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3096afb2bd1cSJunchao Zhang matstruct->alpha_one, 30973606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3098afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3099afb2bd1cSJunchao Zhang beta, 3100afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3101afb2bd1cSJunchao Zhang cusparse_scalartype, 3102afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 31035f80ce2aSJacob Faibussowitsch matstruct->cuSpMV[opA].spmvBuffer)); 3104afb2bd1cSJunchao Zhang #else 31057656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 31069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 
3107a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3108afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3109aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3110e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 31115f80ce2aSJacob Faibussowitsch dptr)); 3112afb2bd1cSJunchao Zhang #endif 3113aa372e3fSPaul Mullowney } else { 3114213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3115afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3116afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3117afb2bd1cSJunchao Zhang #else 3118301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 31199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3120afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3121e6e9a74fSStefano Zampini xptr, beta, 31225f80ce2aSJacob Faibussowitsch dptr)); 3123afb2bd1cSJunchao Zhang #endif 3124a65300a6SPaul Mullowney } 3125aa372e3fSPaul Mullowney } 31269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3127aa372e3fSPaul Mullowney 3128e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3129213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3130213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 31319566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3132e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 31339566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 31347656d835SStefano Zampini } 3135213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 31369566063dSJacob Faibussowitsch PetscCall(VecSet_SeqCUDA(zz,0)); 31377656d835SStefano Zampini } 31387656d835SStefano Zampini 3139213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3140213423ffSJunchao Zhang if (compressed) { 31419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3142a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3143a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3144a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3145a0e72f99SJunchao Zhang */ 3146a0e72f99SJunchao Zhang #if 0 3147a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3148a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3149a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3150e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3151c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3152a0e72f99SJunchao Zhang #else 3153a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3154a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3155a0e72f99SJunchao Zhang #endif 31569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3157e6e9a74fSStefano Zampini } 3158e6e9a74fSStefano Zampini } else { 
3159e6e9a74fSStefano Zampini if (yy && yy != zz) { 31609566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3161e6e9a74fSStefano Zampini } 3162e6e9a74fSStefano Zampini } 31639566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 31649566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 31659566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 31669ae82921SPaul Mullowney } catch(char *ex) { 316798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 31689ae82921SPaul Mullowney } 3169e6e9a74fSStefano Zampini if (yy) { 31709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3171e6e9a74fSStefano Zampini } else { 31729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3173e6e9a74fSStefano Zampini } 31749ae82921SPaul Mullowney PetscFunctionReturn(0); 31759ae82921SPaul Mullowney } 31769ae82921SPaul Mullowney 31776fa9248bSJed Brown /* zz = A^T*xx + yy: calls MatMultAddKernel_SeqAIJCUSPARSE() with trans = PETSC_TRUE and herm = PETSC_FALSE. */ static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3178ca45077fSPaul Mullowney { 3179ca45077fSPaul Mullowney PetscFunctionBegin; 31809566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3181ca45077fSPaul Mullowney PetscFunctionReturn(0); 3182ca45077fSPaul Mullowney } 3183ca45077fSPaul Mullowney 31846fa9248bSJed Brown /* Runs MatAssemblyEnd_SeqAIJ(); if that changed A->nonzerostate and cusp->deviceMat is non-NULL, the device matrix is released with cudaFree() and the pointer is reset to NULL. */ static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 31859ae82921SPaul Mullowney { 3186042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3187042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 31883fa6b06aSMark Adams 3189042217e8SBarry Smith PetscFunctionBegin; 31909566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3191042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3192042217e8SBarry Smith 31939566063dSJacob 
Faibussowitsch PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 31949566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->deviceMat)); 3195042217e8SBarry Smith cusp->deviceMat = NULL; 3196042217e8SBarry Smith } 31979ae82921SPaul Mullowney PetscFunctionReturn(0); 31989ae82921SPaul Mullowney } 31999ae82921SPaul Mullowney 32009ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3201e057df02SPaul Mullowney /*@ 32029ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3203e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately be pushed down 3204e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3205e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3206e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3207e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 32089ae82921SPaul Mullowney 3209d083f849SBarry Smith Collective 32109ae82921SPaul Mullowney 32119ae82921SPaul Mullowney Input Parameters: 32129ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 32139ae82921SPaul Mullowney . m - number of rows 32149ae82921SPaul Mullowney . n - number of columns 32159ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 32169ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 32170298fd71SBarry Smith (possibly different for each row) or NULL 32189ae82921SPaul Mullowney 32199ae82921SPaul Mullowney Output Parameter: 32209ae82921SPaul Mullowney . 
A - the matrix 32219ae82921SPaul Mullowney 32229ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 32239ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 32249ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 32259ae82921SPaul Mullowney 32269ae82921SPaul Mullowney Notes: 32279ae82921SPaul Mullowney If nnz is given then nz is ignored 32289ae82921SPaul Mullowney 32299ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 32309ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 32319ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 32329ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 32339ae82921SPaul Mullowney 32349ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 32350298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 32369ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 32379ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 32389ae82921SPaul Mullowney 32399ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 32409ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 32419ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 32429ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 
32439ae82921SPaul Mullowney 32449ae82921SPaul Mullowney Level: intermediate 32459ae82921SPaul Mullowney 3246e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 32479ae82921SPaul Mullowney @*/ 32489ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 32499ae82921SPaul Mullowney { 32509ae82921SPaul Mullowney PetscFunctionBegin; 32519566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 32529566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 32539566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 32549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 32559ae82921SPaul Mullowney PetscFunctionReturn(0); 32569ae82921SPaul Mullowney } 32579ae82921SPaul Mullowney 32586fa9248bSJed Brown /* Destroys the data behind A->spptr (MatSeqAIJCUSPARSE_Destroy() when A->factortype is MAT_FACTOR_NONE, MatSeqAIJCUSPARSETriFactors_Destroy() otherwise), composes NULL for the type-specific "..._C" functions listed below, then calls MatDestroy_SeqAIJ(). */ static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 32599ae82921SPaul Mullowney { 32609ae82921SPaul Mullowney PetscFunctionBegin; 32619ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 32629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 32639ae82921SPaul Mullowney } else { 32649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 3265aa372e3fSPaul Mullowney } 32669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 32679566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 32689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 32699566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 32709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 32719566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 32729566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 32739566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 32749566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 32759566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 32769566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 32779ae82921SPaul Mullowney PetscFunctionReturn(0); 32789ae82921SPaul Mullowney } 32799ae82921SPaul Mullowney 3280ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 328195639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 32829ff858a8SKarl Rupp /* Duplicates A with MatDuplicate_SeqAIJ(), then converts *B in place (MAT_INPLACE_MATRIX) to MATSEQAIJCUSPARSE. */ static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 32839ff858a8SKarl Rupp { 32849ff858a8SKarl Rupp PetscFunctionBegin; 32859566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 32869566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 32879ff858a8SKarl Rupp PetscFunctionReturn(0); 32889ff858a8SKarl Rupp } 32899ff858a8SKarl Rupp 3290039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 329195639643SRichard Tran Mills { 3292a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = 
(Mat_SeqAIJ*)Y->data; 3293039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3294039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3295039c6fbaSStefano Zampini PetscScalar *ay; 3296039c6fbaSStefano Zampini const PetscScalar *ax; 3297039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3298e6e9a74fSStefano Zampini 329995639643SRichard Tran Mills PetscFunctionBegin; 3300a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3301a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3302039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 33039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33049566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3305a587d139SMark PetscFunctionReturn(0); 330695639643SRichard Tran Mills } 3307039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 33089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 33099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 33105f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 33115f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3312039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3313039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3314039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3315039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3316039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3317039c6fbaSStefano Zampini if (eq) { 3318039c6fbaSStefano Zampini eq = 
thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3319039c6fbaSStefano Zampini } 3320039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3321039c6fbaSStefano Zampini } 3322d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3323d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3324039c6fbaSStefano Zampini 3325039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3326039c6fbaSStefano Zampini PetscScalar b = 1.0; 3327039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3328039c6fbaSStefano Zampini size_t bufferSize; 3329039c6fbaSStefano Zampini void *buffer; 3330039c6fbaSStefano Zampini #endif 3331039c6fbaSStefano Zampini 33329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3335039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3337039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3338039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33395f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 33409566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 33419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3343039c6fbaSStefano Zampini 
&a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3344039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33455f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 33469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 33479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33489566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3349039c6fbaSStefano Zampini #else 33509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3352039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3353039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33545f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 33559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 33569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3357039c6fbaSStefano Zampini #endif 33589566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 33599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3362039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3363a587d139SMark cublasHandle_t cublasv2handle; 3364a587d139SMark PetscBLASInt one = 1, bnz = 1; 3365039c6fbaSStefano Zampini 33669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33679566063dSJacob 
Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33689566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 33699566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz,&bnz)); 33709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33719566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 33729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*bnz)); 33739566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3377039c6fbaSStefano Zampini } else { 33789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33799566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3380a587d139SMark } 338195639643SRichard Tran Mills PetscFunctionReturn(0); 338295639643SRichard Tran Mills } 338395639643SRichard Tran Mills 338433c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 338533c9ba73SStefano Zampini { 338633c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 338733c9ba73SStefano Zampini PetscScalar *ay; 338833c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 338933c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 339033c9ba73SStefano Zampini 339133c9ba73SStefano Zampini PetscFunctionBegin; 33929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33939566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 33949566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz,&bnz)); 33959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33969566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 33979566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuFlops(bnz)); 33989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 34009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 340133c9ba73SStefano Zampini PetscFunctionReturn(0); 340233c9ba73SStefano Zampini } 340333c9ba73SStefano Zampini 34043fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 34053fa6b06aSMark Adams { 34067e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3407a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 34087e8381f9SStefano Zampini 34093fa6b06aSMark Adams PetscFunctionBegin; 34103fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 34113fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 34127e8381f9SStefano Zampini if (spptr->mat) { 34137e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 34147e8381f9SStefano Zampini if (matrix->values) { 34157e8381f9SStefano Zampini both = PETSC_TRUE; 34167e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34177e8381f9SStefano Zampini } 34187e8381f9SStefano Zampini } 34197e8381f9SStefano Zampini if (spptr->matTranspose) { 34207e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 34217e8381f9SStefano Zampini if (matrix->values) { 34227e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34237e8381f9SStefano Zampini } 34247e8381f9SStefano Zampini } 34253fa6b06aSMark Adams } 34269566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 34279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 34287e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3429a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 34303fa6b06aSMark Adams PetscFunctionReturn(0); 34313fa6b06aSMark Adams } 34323fa6b06aSMark Adams 
3433a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3434a587d139SMark { 3435a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3436a587d139SMark 3437a587d139SMark PetscFunctionBegin; 34389a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 34399a14fc28SStefano Zampini A->boundtocpu = flg; 34409a14fc28SStefano Zampini PetscFunctionReturn(0); 34419a14fc28SStefano Zampini } 3442a587d139SMark if (flg) { 34439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3444a587d139SMark 344533c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3446a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3447a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3448a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3449a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3450a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3451a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3452a587d139SMark A->ops->multhermitiantranspose = NULL; 3453a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3454fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 34559566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 34569566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 34579566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 34589566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 34599566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 34609566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 34619566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ)); 34629566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 3463a587d139SMark } else { 346433c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3465a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3466a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3467a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3468a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3469a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3470a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3471a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3472a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3473fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 347467a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 347567a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 347667a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 347767a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 347867a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 347967a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 34807ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 34817ee59b9bSJunchao Zhang 34829566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 34839566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 34849566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 34859566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 34869566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 34879566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 3488a587d139SMark } 3489a587d139SMark A->boundtocpu = flg; 3490ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3491ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3492ea500dcfSRichard Tran Mills } else { 3493ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3494ea500dcfSRichard Tran Mills } 3495a587d139SMark PetscFunctionReturn(0); 3496a587d139SMark } 3497a587d139SMark 349849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 34999ae82921SPaul Mullowney { 350049735bf3SStefano Zampini Mat B; 35019ae82921SPaul Mullowney 35029ae82921SPaul Mullowney PetscFunctionBegin; 35039566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 350449735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 35059566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 350649735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 35079566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 350849735bf3SStefano Zampini } 350949735bf3SStefano Zampini B = 
*newmat; 351049735bf3SStefano Zampini 35119566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 35129566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); 351334136279SStefano Zampini 351449735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 35159ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3516e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 35179566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 35189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 35199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 35201a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3521d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 35228efa179dSJose E. Roman #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 3523a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3524a435da06SStefano Zampini #else 3525d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3526a435da06SStefano Zampini #endif 3527d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3528d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3529d8132acaSStefano Zampini #endif 35301a2c6b5cSJunchao Zhang B->spptr = spptr; 35319ae82921SPaul Mullowney } else { 3532e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3533e6e9a74fSStefano Zampini 35349566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 35359566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 35369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 3537e6e9a74fSStefano Zampini B->spptr = spptr; 35389ae82921SPaul Mullowney } 3539e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 
354049735bf3SStefano Zampini } 3541693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 35429ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 35431a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 35449ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 354595639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3546693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 35472205254eSKarl Rupp 35489566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 35499566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 35509566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3551ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 35529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 3553ae48a8d0SStefano Zampini #endif 35549566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 35559ae82921SPaul Mullowney PetscFunctionReturn(0); 35569ae82921SPaul Mullowney } 35579ae82921SPaul Mullowney 355802fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 355902fe1965SBarry Smith { 356002fe1965SBarry Smith PetscFunctionBegin; 35619566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 35629566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 356302fe1965SBarry Smith PetscFunctionReturn(0); 356402fe1965SBarry Smith } 356502fe1965SBarry Smith 35663ca39a21SBarry Smith /*MC 3567e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 
3568e057df02SPaul Mullowney 3569e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 35702692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 35712692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3572e057df02SPaul Mullowney 3573e057df02SPaul Mullowney Options Database Keys: 3574e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3575aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3576a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3577365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3578e057df02SPaul Mullowney 3579e057df02SPaul Mullowney Level: beginner 3580e057df02SPaul Mullowney 35818468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3582e057df02SPaul Mullowney M*/ 35837f756511SDominic Meiser 3584bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 35850f39cd5aSBarry Smith 35863ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 358742c9c57cSBarry Smith { 358842c9c57cSBarry Smith PetscFunctionBegin; 35899566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 35909566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 35919566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 35929566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 35939566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3594bddcd29dSMark Adams 359542c9c57cSBarry Smith PetscFunctionReturn(0); 359642c9c57cSBarry Smith } 359729b38603SBarry Smith 3598cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3599cbc6b225SStefano Zampini { 3600cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3601cbc6b225SStefano Zampini 3602cbc6b225SStefano Zampini PetscFunctionBegin; 3603cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 
3604cbc6b225SStefano Zampini delete cusp->cooPerm; 3605cbc6b225SStefano Zampini delete cusp->cooPerm_a; 3606cbc6b225SStefano Zampini cusp->cooPerm = NULL; 3607cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 3608cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 36099566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 36109566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 3611cbc6b225SStefano Zampini } 3612cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 3613cbc6b225SStefano Zampini PetscFunctionReturn(0); 3614cbc6b225SStefano Zampini } 3615cbc6b225SStefano Zampini 3616470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 36177f756511SDominic Meiser { 36187f756511SDominic Meiser PetscFunctionBegin; 36197f756511SDominic Meiser if (*cusparsestruct) { 36209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 36219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 36227f756511SDominic Meiser delete (*cusparsestruct)->workVector; 362381902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 36247e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 36257e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3626a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 36279566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 36289566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 36299566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 36309566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 36317f756511SDominic Meiser } 36327f756511SDominic Meiser PetscFunctionReturn(0); 
36337f756511SDominic Meiser } 36347f756511SDominic Meiser 36357f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 36367f756511SDominic Meiser { 36377f756511SDominic Meiser PetscFunctionBegin; 36387f756511SDominic Meiser if (*mat) { 36397f756511SDominic Meiser delete (*mat)->values; 36407f756511SDominic Meiser delete (*mat)->column_indices; 36417f756511SDominic Meiser delete (*mat)->row_offsets; 36427f756511SDominic Meiser delete *mat; 36437f756511SDominic Meiser *mat = 0; 36447f756511SDominic Meiser } 36457f756511SDominic Meiser PetscFunctionReturn(0); 36467f756511SDominic Meiser } 36477f756511SDominic Meiser 3648470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 36497f756511SDominic Meiser { 36507f756511SDominic Meiser PetscFunctionBegin; 36517f756511SDominic Meiser if (*trifactor) { 36529566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 36539566063dSJacob Faibussowitsch if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo)); 36549566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 36559566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 36569566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3657afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 36589566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3659afb2bd1cSJunchao Zhang #endif 36609566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 36617f756511SDominic Meiser } 36627f756511SDominic Meiser PetscFunctionReturn(0); 36637f756511SDominic Meiser } 36647f756511SDominic Meiser 3665470880abSPatrick Sanan static PetscErrorCode 
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 36667f756511SDominic Meiser { 36677f756511SDominic Meiser CsrMatrix *mat; 36687f756511SDominic Meiser 36697f756511SDominic Meiser PetscFunctionBegin; 36707f756511SDominic Meiser if (*matstruct) { 36717f756511SDominic Meiser if ((*matstruct)->mat) { 36727f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3673afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3674afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3675afb2bd1cSJunchao Zhang #else 36767f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 36779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3678afb2bd1cSJunchao Zhang #endif 36797f756511SDominic Meiser } else { 36807f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 36817f756511SDominic Meiser CsrMatrix_Destroy(&mat); 36827f756511SDominic Meiser } 36837f756511SDominic Meiser } 36849566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 36857f756511SDominic Meiser delete (*matstruct)->cprowIndices; 36869566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 36879566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 36889566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3689afb2bd1cSJunchao Zhang 3690afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3691afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 36929566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3693afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3694afb2bd1cSJunchao Zhang if 
(mdata->cuSpMV[i].initialized) { 36959566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 36969566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 36979566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3698afb2bd1cSJunchao Zhang } 3699afb2bd1cSJunchao Zhang } 3700afb2bd1cSJunchao Zhang #endif 37017f756511SDominic Meiser delete *matstruct; 37027e8381f9SStefano Zampini *matstruct = NULL; 37037f756511SDominic Meiser } 37047f756511SDominic Meiser PetscFunctionReturn(0); 37057f756511SDominic Meiser } 37067f756511SDominic Meiser 3707e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 37087f756511SDominic Meiser { 37097f756511SDominic Meiser PetscFunctionBegin; 37107f756511SDominic Meiser if (*trifactors) { 37119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 37129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 37139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 37149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 37157f756511SDominic Meiser delete (*trifactors)->rpermIndices; 37167f756511SDominic Meiser delete (*trifactors)->cpermIndices; 37177f756511SDominic Meiser delete (*trifactors)->workVector; 37187e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 37197e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 37207e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 37219566063dSJacob Faibussowitsch if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 37229566063dSJacob Faibussowitsch if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3723e8d2b73aSMark 
Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3724ccdfe979SStefano Zampini } 3725ccdfe979SStefano Zampini PetscFunctionReturn(0); 3726ccdfe979SStefano Zampini } 3727ccdfe979SStefano Zampini 3728ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3729ccdfe979SStefano Zampini { 3730ccdfe979SStefano Zampini cusparseHandle_t handle; 3731ccdfe979SStefano Zampini 3732ccdfe979SStefano Zampini PetscFunctionBegin; 3733ccdfe979SStefano Zampini if (*trifactors) { 37349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 37357f756511SDominic Meiser if (handle = (*trifactors)->handle) { 37369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroy(handle)); 37377f756511SDominic Meiser } 37389566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 37397f756511SDominic Meiser } 37407f756511SDominic Meiser PetscFunctionReturn(0); 37417f756511SDominic Meiser } 37427e8381f9SStefano Zampini 37437e8381f9SStefano Zampini struct IJCompare 37447e8381f9SStefano Zampini { 37457e8381f9SStefano Zampini __host__ __device__ 37467e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37477e8381f9SStefano Zampini { 37487e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 37497e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 37507e8381f9SStefano Zampini return false; 37517e8381f9SStefano Zampini } 37527e8381f9SStefano Zampini }; 37537e8381f9SStefano Zampini 37547e8381f9SStefano Zampini struct IJEqual 37557e8381f9SStefano Zampini { 37567e8381f9SStefano Zampini __host__ __device__ 37577e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37587e8381f9SStefano Zampini { 37597e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != 
t2.get<1>()) return false; 37607e8381f9SStefano Zampini return true; 37617e8381f9SStefano Zampini } 37627e8381f9SStefano Zampini }; 37637e8381f9SStefano Zampini 37647e8381f9SStefano Zampini struct IJDiff 37657e8381f9SStefano Zampini { 37667e8381f9SStefano Zampini __host__ __device__ 37677e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37687e8381f9SStefano Zampini { 37697e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 37707e8381f9SStefano Zampini } 37717e8381f9SStefano Zampini }; 37727e8381f9SStefano Zampini 37737e8381f9SStefano Zampini struct IJSum 37747e8381f9SStefano Zampini { 37757e8381f9SStefano Zampini __host__ __device__ 37767e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37777e8381f9SStefano Zampini { 37787e8381f9SStefano Zampini return t1||t2; 37797e8381f9SStefano Zampini } 37807e8381f9SStefano Zampini }; 37817e8381f9SStefano Zampini 37827e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3783219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3784219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 37857e8381f9SStefano Zampini { 37867e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3787fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3788bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 378908391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 37907e8381f9SStefano Zampini CsrMatrix *matrix; 37917e8381f9SStefano Zampini PetscInt n; 37927e8381f9SStefano Zampini 37937e8381f9SStefano Zampini PetscFunctionBegin; 379428b400f6SJacob Faibussowitsch PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 379528b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 37967e8381f9SStefano Zampini if (!cusp->cooPerm) { 
37979566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 37989566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 37997e8381f9SStefano Zampini PetscFunctionReturn(0); 38007e8381f9SStefano Zampini } 38017e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 380228b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3803e61fc153SStefano Zampini if (!v) { 3804e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3805e61fc153SStefano Zampini goto finalize; 38067e8381f9SStefano Zampini } 3807e61fc153SStefano Zampini n = cusp->cooPerm->size(); 380808391a17SStefano Zampini if (isCudaMem(v)) { 380908391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 381008391a17SStefano Zampini } else { 3811e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3812e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 381308391a17SStefano Zampini d_v = cooPerm_v->data(); 38149566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 381508391a17SStefano Zampini } 38169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3817e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3818ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3819bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 382008391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3821ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3822ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3823ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
3824ddea5d60SJunchao Zhang */ 3825e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3826e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3827e61fc153SStefano Zampini delete cooPerm_w; 38287e8381f9SStefano Zampini } else { 3829ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 383008391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38317e8381f9SStefano Zampini matrix->values->begin())); 383208391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 38337e8381f9SStefano Zampini matrix->values->end())); 3834ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 38357e8381f9SStefano Zampini } 38367e8381f9SStefano Zampini } else { 3837e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 383808391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3839e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 38407e8381f9SStefano Zampini } else { 384108391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38427e8381f9SStefano Zampini matrix->values->begin())); 384308391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 
38447e8381f9SStefano Zampini matrix->values->end())); 38457e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 38467e8381f9SStefano Zampini } 38477e8381f9SStefano Zampini } 38489566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3849e61fc153SStefano Zampini finalize: 3850e61fc153SStefano Zampini delete cooPerm_v; 38517e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 38529566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 3853fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 38549566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 38559566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 38569566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 3857fcdce8c4SStefano Zampini a->reallocs = 0; 3858fcdce8c4SStefano Zampini A->info.mallocs += 0; 3859fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3860fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3861fcdce8c4SStefano Zampini A->num_ass++; 38627e8381f9SStefano Zampini PetscFunctionReturn(0); 38637e8381f9SStefano Zampini } 38647e8381f9SStefano Zampini 3865a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3866a49f1ed0SStefano Zampini { 3867a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3868a49f1ed0SStefano Zampini 3869a49f1ed0SStefano Zampini PetscFunctionBegin; 3870a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3871a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3872a49f1ed0SStefano Zampini if (destroy) { 38739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 3874a49f1ed0SStefano 
Zampini delete cusp->csr2csc_i; 3875a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3876a49f1ed0SStefano Zampini } 38771a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3878a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3879a49f1ed0SStefano Zampini } 3880a49f1ed0SStefano Zampini 38817e8381f9SStefano Zampini #include <thrust/binary_search.h> 3882219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 3883219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 38847e8381f9SStefano Zampini { 38857e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 38867e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 38877e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 38887e8381f9SStefano Zampini 38897e8381f9SStefano Zampini PetscFunctionBegin; 38909566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 38919566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 38927e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 38937e8381f9SStefano Zampini if (n != cooPerm_n) { 38947e8381f9SStefano Zampini delete cusp->cooPerm; 38957e8381f9SStefano Zampini delete cusp->cooPerm_a; 38967e8381f9SStefano Zampini cusp->cooPerm = NULL; 38977e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 38987e8381f9SStefano Zampini } 38997e8381f9SStefano Zampini if (n) { 39007e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 39017e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 39027e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 39037e8381f9SStefano Zampini 39047e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 39057e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 39067e8381f9SStefano Zampini 39079566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 39087e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 39097e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 3910ddea5d60SJunchao Zhang 3911ddea5d60SJunchao Zhang /* Ex. 
3912ddea5d60SJunchao Zhang n = 6 3913ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 3914ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 3915ddea5d60SJunchao Zhang */ 39167e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 39177e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 39187e8381f9SStefano Zampini 39199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39207e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3921ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3922ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 39237e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 39247e8381f9SStefano Zampini 3925ddea5d60SJunchao Zhang /* 3926ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 3927ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 3928ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 3929ddea5d60SJunchao Zhang */ 3930ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3931ddea5d60SJunchao Zhang 3932ddea5d60SJunchao Zhang /* 3933ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 3934ddea5d60SJunchao Zhang ^ekey 3935ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 3936ddea5d60SJunchao Zhang ^nekye 3937ddea5d60SJunchao Zhang */ 39387e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 39397e8381f9SStefano Zampini delete cusp->cooPerm_a; 39407e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 3941ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3942ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3943ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 3944ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3945ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 39467e8381f9SStefano Zampini w[0] = 0; 3947ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3948ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 39497e8381f9SStefano Zampini } 39507e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 3951ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3952ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3953ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 39549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39557e8381f9SStefano Zampini 39569566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 39577e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 39587e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 39597e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 39609566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 3961ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 39629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39637e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3964fcdce8c4SStefano Zampini a->rmax = 0; 39659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->a)); 39669566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->j)); 39679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39689566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 39699566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 39707e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 39717e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 39727e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 39737e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3974fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 39757e8381f9SStefano Zampini } 3976fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 39777e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 39789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 39799566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 39807e8381f9SStefano Zampini } else { 39819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 
39827e8381f9SStefano Zampini } 39839566063dSJacob Faibussowitsch PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 39847e8381f9SStefano Zampini 39857e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3986e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 39879566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->nz)); 39889566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 39897e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 39909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 39919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 39927e8381f9SStefano Zampini PetscFunctionReturn(0); 39937e8381f9SStefano Zampini } 3994ed502f03SStefano Zampini 3995219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 3996219fbbafSJunchao Zhang { 3997219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 3998219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 3999cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4000219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4001219fbbafSJunchao Zhang 4002219fbbafSJunchao Zhang PetscFunctionBegin; 40039566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 40049566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4005219fbbafSJunchao Zhang if (coo_i) { 40069566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i,&mtype)); 4007219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4008219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { 4009cbc6b225SStefano Zampini if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4010219fbbafSJunchao Zhang } 4011219fbbafSJunchao Zhang } 4012219fbbafSJunchao Zhang } 4013219fbbafSJunchao Zhang 
4014219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 40159566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4016219fbbafSJunchao Zhang } else { 40179566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4018cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 40199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4020219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ*>(mat->data); 4021219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 40229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 40239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 40249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 40259566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4026219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4027219fbbafSJunchao Zhang } 4028219fbbafSJunchao Zhang PetscFunctionReturn(0); 4029219fbbafSJunchao Zhang } 4030219fbbafSJunchao Zhang 4031b6c38306SJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4032219fbbafSJunchao Zhang { 4033219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4034219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4035b6c38306SJunchao Zhang for (; i<nnz; i+= grid_size) { 4036b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4037b6c38306SJunchao Zhang for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4038b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES? 
0.0 : a[i]) + sum; 4039b6c38306SJunchao Zhang } 4040219fbbafSJunchao Zhang } 4041219fbbafSJunchao Zhang 4042219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4043219fbbafSJunchao Zhang { 4044219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4045219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4046219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4047219fbbafSJunchao Zhang PetscMemType memtype; 4048219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4049219fbbafSJunchao Zhang PetscScalar *Aa; 4050219fbbafSJunchao Zhang 4051219fbbafSJunchao Zhang PetscFunctionBegin; 4052219fbbafSJunchao Zhang if (dev->use_extended_coo) { 40539566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 4054219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 40559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 40569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4057219fbbafSJunchao Zhang } 4058219fbbafSJunchao Zhang 40599566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 40609566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4061219fbbafSJunchao Zhang 4062cbc6b225SStefano Zampini if (Annz) { 4063b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 40649566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4065cbc6b225SStefano Zampini } 4066219fbbafSJunchao Zhang 40679566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 40689566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4069219fbbafSJunchao Zhang 40709566063dSJacob 
Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4071219fbbafSJunchao Zhang } else { 40729566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4073219fbbafSJunchao Zhang } 4074219fbbafSJunchao Zhang PetscFunctionReturn(0); 4075219fbbafSJunchao Zhang } 4076219fbbafSJunchao Zhang 40775b7e41feSStefano Zampini /*@C 40785b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 40795b7e41feSStefano Zampini 40805b7e41feSStefano Zampini Not collective 40815b7e41feSStefano Zampini 40825b7e41feSStefano Zampini Input Parameters: 40835b7e41feSStefano Zampini + A - the matrix 40845b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40855b7e41feSStefano Zampini 40865b7e41feSStefano Zampini Output Parameters: 40875b7e41feSStefano Zampini + ia - the CSR row pointers 40885b7e41feSStefano Zampini - ja - the CSR column indices 40895b7e41feSStefano Zampini 40905b7e41feSStefano Zampini Level: developer 40915b7e41feSStefano Zampini 40925b7e41feSStefano Zampini Notes: 40935b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 40945b7e41feSStefano Zampini 40955b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 40965b7e41feSStefano Zampini @*/ 40975f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 40985f101d05SStefano Zampini { 40995f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 41005f101d05SStefano Zampini CsrMatrix *csr; 41015f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41025f101d05SStefano Zampini 41035f101d05SStefano Zampini PetscFunctionBegin; 41045f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41055f101d05SStefano Zampini if (!i 
|| !j) PetscFunctionReturn(0); 41065f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41072c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 410928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 41105f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 41115f101d05SStefano Zampini if (i) { 41125f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 41135f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 41145f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 41155f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 41169566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 41175f101d05SStefano Zampini } 41185f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 41195f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 41205f101d05SStefano Zampini } 41215f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 41225f101d05SStefano Zampini PetscFunctionReturn(0); 41235f101d05SStefano Zampini } 41245f101d05SStefano Zampini 41255b7e41feSStefano Zampini /*@C 41265b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 41275b7e41feSStefano Zampini 41285b7e41feSStefano Zampini Not collective 41295b7e41feSStefano Zampini 41305b7e41feSStefano Zampini Input Parameters: 41315b7e41feSStefano Zampini + A - the matrix 41325b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 41335b7e41feSStefano Zampini 41345b7e41feSStefano Zampini Output 
Parameters: 41355b7e41feSStefano Zampini + ia - the CSR row pointers 41365b7e41feSStefano Zampini - ja - the CSR column indices 41375b7e41feSStefano Zampini 41385b7e41feSStefano Zampini Level: developer 41395b7e41feSStefano Zampini 41405b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetIJ() 41415b7e41feSStefano Zampini @*/ 41425f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 41435f101d05SStefano Zampini { 41445f101d05SStefano Zampini PetscFunctionBegin; 41455f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41465f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41475f101d05SStefano Zampini if (i) *i = NULL; 41485f101d05SStefano Zampini if (j) *j = NULL; 41495f101d05SStefano Zampini PetscFunctionReturn(0); 41505f101d05SStefano Zampini } 41515f101d05SStefano Zampini 41525b7e41feSStefano Zampini /*@C 41535b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41545b7e41feSStefano Zampini 41555b7e41feSStefano Zampini Not Collective 41565b7e41feSStefano Zampini 41575b7e41feSStefano Zampini Input Parameter: 41585b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41595b7e41feSStefano Zampini 41605b7e41feSStefano Zampini Output Parameter: 41615b7e41feSStefano Zampini . 
a - pointer to the device data 41625b7e41feSStefano Zampini 41635b7e41feSStefano Zampini Level: developer 41645b7e41feSStefano Zampini 41655b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 41665b7e41feSStefano Zampini 41675b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 41685b7e41feSStefano Zampini @*/ 4169ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4170ed502f03SStefano Zampini { 4171ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4172ed502f03SStefano Zampini CsrMatrix *csr; 4173ed502f03SStefano Zampini 4174ed502f03SStefano Zampini PetscFunctionBegin; 4175ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4176ed502f03SStefano Zampini PetscValidPointer(a,2); 4177ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41782c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 418028b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4181ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 418228b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4183ed502f03SStefano Zampini *a = csr->values->data().get(); 4184ed502f03SStefano Zampini PetscFunctionReturn(0); 4185ed502f03SStefano Zampini } 4186ed502f03SStefano Zampini 41875b7e41feSStefano Zampini /*@C 41885b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 41895b7e41feSStefano Zampini 41905b7e41feSStefano Zampini Not Collective 41915b7e41feSStefano Zampini 
41925b7e41feSStefano Zampini Input Parameter: 41935b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41945b7e41feSStefano Zampini 41955b7e41feSStefano Zampini Output Parameter: 41965b7e41feSStefano Zampini . a - pointer to the device data 41975b7e41feSStefano Zampini 41985b7e41feSStefano Zampini Level: developer 41995b7e41feSStefano Zampini 42005b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead() 42015b7e41feSStefano Zampini @*/ 4202ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4203ed502f03SStefano Zampini { 4204ed502f03SStefano Zampini PetscFunctionBegin; 4205ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4206ed502f03SStefano Zampini PetscValidPointer(a,2); 4207ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4208ed502f03SStefano Zampini *a = NULL; 4209ed502f03SStefano Zampini PetscFunctionReturn(0); 4210ed502f03SStefano Zampini } 4211ed502f03SStefano Zampini 42125b7e41feSStefano Zampini /*@C 42135b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42145b7e41feSStefano Zampini 42155b7e41feSStefano Zampini Not Collective 42165b7e41feSStefano Zampini 42175b7e41feSStefano Zampini Input Parameter: 42185b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42195b7e41feSStefano Zampini 42205b7e41feSStefano Zampini Output Parameter: 42215b7e41feSStefano Zampini . 
a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read A->spptr only after A has been validated, so that a NULL/invalid A is reported instead of dereferenced */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* the raw values pointer is only handed out for the CSR storage format */
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may modify the values: flag the GPU copy as the valid one and invalidate the cached transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4250039c6fbaSStefano Zampini 42515b7e41feSStefano Zampini Not Collective 42525b7e41feSStefano Zampini 42535b7e41feSStefano Zampini Input Parameter: 42545b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42555b7e41feSStefano Zampini 42565b7e41feSStefano Zampini Output Parameter: 42575b7e41feSStefano Zampini . a - pointer to the device data 42585b7e41feSStefano Zampini 42595b7e41feSStefano Zampini Level: developer 42605b7e41feSStefano Zampini 42615b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray() 42625b7e41feSStefano Zampini @*/ 4263039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4264039c6fbaSStefano Zampini { 4265039c6fbaSStefano Zampini PetscFunctionBegin; 4266039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4267039c6fbaSStefano Zampini PetscValidPointer(a,2); 4268039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 42699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 42709566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4271039c6fbaSStefano Zampini *a = NULL; 4272039c6fbaSStefano Zampini PetscFunctionReturn(0); 4273039c6fbaSStefano Zampini } 4274039c6fbaSStefano Zampini 42755b7e41feSStefano Zampini /*@C 42765b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42775b7e41feSStefano Zampini 42785b7e41feSStefano Zampini Not Collective 42795b7e41feSStefano Zampini 42805b7e41feSStefano Zampini Input Parameter: 42815b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42825b7e41feSStefano Zampini 42835b7e41feSStefano Zampini Output Parameter: 42845b7e41feSStefano Zampini . 
a - pointer to the device data 42855b7e41feSStefano Zampini 42865b7e41feSStefano Zampini Level: developer 42875b7e41feSStefano Zampini 42885b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 42895b7e41feSStefano Zampini 42905b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 42915b7e41feSStefano Zampini @*/ 4292ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4293ed502f03SStefano Zampini { 4294ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4295ed502f03SStefano Zampini CsrMatrix *csr; 4296ed502f03SStefano Zampini 4297ed502f03SStefano Zampini PetscFunctionBegin; 4298ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4299ed502f03SStefano Zampini PetscValidPointer(a,2); 4300ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 43012c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 430228b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4303ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 430428b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4305ed502f03SStefano Zampini *a = csr->values->data().get(); 4306039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 43079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4308ed502f03SStefano Zampini PetscFunctionReturn(0); 4309ed502f03SStefano Zampini } 4310ed502f03SStefano Zampini 43115b7e41feSStefano Zampini /*@C 43125b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 43135b7e41feSStefano Zampini 
43145b7e41feSStefano Zampini Not Collective 43155b7e41feSStefano Zampini 43165b7e41feSStefano Zampini Input Parameter: 43175b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43185b7e41feSStefano Zampini 43195b7e41feSStefano Zampini Output Parameter: 43205b7e41feSStefano Zampini . a - pointer to the device data 43215b7e41feSStefano Zampini 43225b7e41feSStefano Zampini Level: developer 43235b7e41feSStefano Zampini 43245b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 43255b7e41feSStefano Zampini @*/ 4326ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4327ed502f03SStefano Zampini { 4328ed502f03SStefano Zampini PetscFunctionBegin; 4329ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4330ed502f03SStefano Zampini PetscValidPointer(a,2); 4331ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 43329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 43339566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4334ed502f03SStefano Zampini *a = NULL; 4335ed502f03SStefano Zampini PetscFunctionReturn(0); 4336ed502f03SStefano Zampini } 4337ed502f03SStefano Zampini 4338ed502f03SStefano Zampini struct IJCompare4 4339ed502f03SStefano Zampini { 4340ed502f03SStefano Zampini __host__ __device__ 43412ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4342ed502f03SStefano Zampini { 4343ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4344ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4345ed502f03SStefano Zampini return false; 4346ed502f03SStefano Zampini } 4347ed502f03SStefano Zampini }; 4348ed502f03SStefano Zampini 43498909a122SStefano Zampini struct Shift 43508909a122SStefano Zampini { 4351ed502f03SStefano Zampini int _shift; 4352ed502f03SStefano Zampini 
4353ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4354ed502f03SStefano Zampini __host__ __device__ 4355ed502f03SStefano Zampini inline int operator() (const int &c) 4356ed502f03SStefano Zampini { 4357ed502f03SStefano Zampini return c + _shift; 4358ed502f03SStefano Zampini } 4359ed502f03SStefano Zampini }; 4360ed502f03SStefano Zampini 4361ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4362ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4363ed502f03SStefano Zampini { 4364ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4365ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4366ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4367ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4368ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4369ed502f03SStefano Zampini cusparseStatus_t stat; 4370ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4371ed502f03SStefano Zampini 4372ed502f03SStefano Zampini PetscFunctionBegin; 4373ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4374ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4375ed502f03SStefano Zampini PetscValidPointer(C,4); 4376ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4377ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 43785f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4379*08401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 43802c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == 
MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 43812c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4382ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4383ed502f03SStefano Zampini m = A->rmap->n; 4384ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 43859566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,C)); 43869566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C,m,n,m,n)); 43879566063dSJacob Faibussowitsch PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 4388ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4389ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4390ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4391ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4392ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4393ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4394ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4395ed502f03SStefano Zampini c->compressedrow.i = NULL; 4396ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4397ed502f03SStefano Zampini Ccusp->workVector = NULL; 4398ed502f03SStefano Zampini Ccusp->nrows = m; 4399ed502f03SStefano Zampini Ccusp->mat = Cmat; 4400ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4401ed502f03SStefano Zampini Ccsr->num_rows = m; 4402ed502f03SStefano Zampini Ccsr->num_cols = n; 44039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 44049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 44059566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 44069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 44079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&(Cmat->beta_zero),sizeof(PetscScalar))); 44089566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 44099566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44119566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 44139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 441428b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 441528b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4416ed502f03SStefano Zampini 4417ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4418ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4419ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4420ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4421ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4422ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4423ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4424ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4425ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4426ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4427ed502f03SStefano Zampini if (c->nz) { 44282ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 44292ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 44302ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 44312ed87e7eSStefano Zampini THRUSTINTARRAY32 
*Aroff,*Broff; 44322ed87e7eSStefano Zampini 4433ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4434ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4435ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4436ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 44379566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4438ed502f03SStefano Zampini } 44392ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 44402ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4441ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4442ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4443ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4444ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 44459566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4446ed502f03SStefano Zampini } 44472ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 44482ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 44499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 44502ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 44512ed87e7eSStefano Zampini Aroff->data().get(), 44522ed87e7eSStefano Zampini Annz, 44532ed87e7eSStefano Zampini m, 44542ed87e7eSStefano Zampini Acoo->data().get(), 44559566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4456ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 44572ed87e7eSStefano Zampini Broff->data().get(), 4458ed502f03SStefano Zampini Bnnz, 4459ed502f03SStefano Zampini m, 44602ed87e7eSStefano Zampini Bcoo->data().get(), 44619566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 44622ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 
44632ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 44642ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 44658909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4466ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4467ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 44688909a122SStefano Zampini #else 44698909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 44708909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 44718909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 44728909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 44738909a122SStefano Zampini #endif 44742ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 44752ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 44762ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 44772ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 44782ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 44792ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4480ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4481ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4482ed502f03SStefano Zampini thrust::advance(p2,Annz); 44832ed87e7eSStefano Zampini 
PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 44848909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 44858909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 44868909a122SStefano Zampini #endif 44872ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 44882ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 44892ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 44902ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 44912ed87e7eSStefano Zampini #else 44922ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 44932ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 44942ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 44952ed87e7eSStefano Zampini #endif 4496ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 44972ed87e7eSStefano Zampini Ccoo->data().get(), 4498ed502f03SStefano Zampini c->nz, 4499ed502f03SStefano Zampini m, 4500ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 45019566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 45029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 45032ed87e7eSStefano Zampini delete wPerm; 45042ed87e7eSStefano Zampini delete Acoo; 45052ed87e7eSStefano Zampini delete Bcoo; 45062ed87e7eSStefano Zampini delete Ccoo; 4507ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4508ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4509ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4510ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 45119566063dSJacob Faibussowitsch 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4512ed502f03SStefano Zampini #endif 45131a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 45149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 45159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4516ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4517ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4518ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4519ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4520ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4521ed502f03SStefano Zampini 45221a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 45231a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4524a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4525ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4526ed502f03SStefano Zampini CmatT->mat = CcsrT; 4527ed502f03SStefano Zampini CcsrT->num_rows = n; 4528ed502f03SStefano Zampini CcsrT->num_cols = m; 4529ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4530ed502f03SStefano Zampini 4531ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4532ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4533ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4534ed502f03SStefano Zampini 45359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4536ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4537ed502f03SStefano Zampini if (AT) { 4538ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 
4539ed502f03SStefano Zampini thrust::advance(rT,-1); 4540ed502f03SStefano Zampini } 4541ed502f03SStefano Zampini if (BT) { 4542ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4543ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4544ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4545ed502f03SStefano Zampini } 4546ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4547ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4548ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4549ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4550ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4551ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 45529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4553ed502f03SStefano Zampini 45549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 45559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 45569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 45579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 45589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 45599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 45609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45619566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4563ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4564ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4565ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4566ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 45679566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4568ed502f03SStefano Zampini #endif 4569ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4570ed502f03SStefano Zampini } 4571ed502f03SStefano Zampini } 4572ed502f03SStefano Zampini 4573ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4574ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4575ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 45769566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 45779566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 4578ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4579ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4580ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4581ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4582ed502f03SStefano Zampini jj = *Ccsr->column_indices; 45839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4585ed502f03SStefano Zampini } else { 
45869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4588ed502f03SStefano Zampini } 45899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 45909566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 45919566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 4592ed502f03SStefano Zampini c->maxnz = c->nz; 4593ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4594ed502f03SStefano Zampini c->rmax = 0; 4595ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4596ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4597ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4598ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4599ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4600ed502f03SStefano Zampini } 46019566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 46029566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 4603ed502f03SStefano Zampini (*C)->nonzerostate++; 46049566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 46059566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4606ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4607ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4608ed502f03SStefano Zampini } else { 4609*08401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4610ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4611ed502f03SStefano Zampini if (c->nz) { 4612ed502f03SStefano Zampini Ccusp = 
(Mat_SeqAIJCUSPARSE*)(*C)->spptr; 46135f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 46142c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4615*08401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 46169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 46179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 46185f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 46195f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4620ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4621ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4622ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 46232c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 46242c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 46252c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 46262c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 
46275f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4628ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4629ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 46309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4631ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4632ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4633ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4634ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4635ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4636ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4637ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4638ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4639ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4640ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 46419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 46421a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 46435f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4644ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE; 4645ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4646ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4647ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4648ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4649ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4650ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 46511a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4652ed502f03SStefano Zampini } 46539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4654ed502f03SStefano Zampini } 4655ed502f03SStefano Zampini } 46569566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4657ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4658ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4659ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4660ed502f03SStefano Zampini PetscFunctionReturn(0); 4661ed502f03SStefano Zampini } 4662c215019aSStefano Zampini 4663c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4664c215019aSStefano Zampini { 4665c215019aSStefano Zampini bool dmem; 4666c215019aSStefano Zampini const PetscScalar *av; 4667c215019aSStefano Zampini 4668c215019aSStefano Zampini PetscFunctionBegin; 4669c215019aSStefano Zampini dmem = isCudaMem(v); 46709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4671c215019aSStefano Zampini if (n && idx) { 4672c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4673c215019aSStefano Zampini widx.assign(idx,idx+n); 46749566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4675c215019aSStefano Zampini 4676c215019aSStefano Zampini THRUSTARRAY *w = NULL; 
4677c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4678c215019aSStefano Zampini if (dmem) { 4679c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4680c215019aSStefano Zampini } else { 4681c215019aSStefano Zampini w = new THRUSTARRAY(n); 4682c215019aSStefano Zampini dv = w->data(); 4683c215019aSStefano Zampini } 4684c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4685c215019aSStefano Zampini 4686c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4687c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4688c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4689c215019aSStefano Zampini if (w) { 46909566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4691c215019aSStefano Zampini } 4692c215019aSStefano Zampini delete w; 4693c215019aSStefano Zampini } else { 46949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4695c215019aSStefano Zampini } 46969566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 46979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4698c215019aSStefano Zampini PetscFunctionReturn(0); 4699c215019aSStefano Zampini } 4700