19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94c215019aSStefano Zampini 95ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 969ae82921SPaul Mullowney { 979ae82921SPaul Mullowney PetscFunctionBegin; 989ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 999ae82921SPaul Mullowney PetscFunctionReturn(0); 1009ae82921SPaul Mullowney } 1019ae82921SPaul Mullowney 102c708e6cdSJed Brown /*MC 103087f3262SPaul 
Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 104087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 105087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 106087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 107087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 108087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 109c708e6cdSJed Brown 1109ae82921SPaul Mullowney Level: beginner 111c708e6cdSJed Brown 1123ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 113c708e6cdSJed Brown M*/ 1149ae82921SPaul Mullowney 11542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1169ae82921SPaul Mullowney { 117bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1189ae82921SPaul Mullowney 1199ae82921SPaul Mullowney PetscFunctionBegin; 1209566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1219566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B,n,n,n,n)); 1222c7c0729SBarry Smith (*B)->factortype = ftype; 1239566063dSJacob Faibussowitsch PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 1242205254eSKarl Rupp 1259566063dSJacob Faibussowitsch if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 126087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 1279566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1289c1083e7SRichard Tran Mills 
if (!A->boundtocpu) { 1299ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1309ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1319c1083e7SRichard Tran Mills } else { 1329c1083e7SRichard Tran Mills (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1339c1083e7SRichard Tran Mills (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1349c1083e7SRichard Tran Mills } 1359566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1369566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 1379566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 138087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1399c1083e7SRichard Tran Mills if (!A->boundtocpu) { 140087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 141087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1429c1083e7SRichard Tran Mills } else { 1439c1083e7SRichard Tran Mills (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1449c1083e7SRichard Tran Mills (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1459c1083e7SRichard Tran Mills } 1469566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 1479566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 1489ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 149bc3f50f2SPaul Mullowney 1509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1514ac6704cSBarry 
Smith (*B)->canuseordering = PETSC_TRUE; 1529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 1539ae82921SPaul Mullowney PetscFunctionReturn(0); 1549ae82921SPaul Mullowney } 1559ae82921SPaul Mullowney 156bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 157ca45077fSPaul Mullowney { 158aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1596e111a19SKarl Rupp 160ca45077fSPaul Mullowney PetscFunctionBegin; 161ca45077fSPaul Mullowney switch (op) { 162e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 163aa372e3fSPaul Mullowney cusparsestruct->format = format; 164ca45077fSPaul Mullowney break; 165e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 166aa372e3fSPaul Mullowney cusparsestruct->format = format; 167ca45077fSPaul Mullowney break; 168ca45077fSPaul Mullowney default: 16998921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 170ca45077fSPaul Mullowney } 171ca45077fSPaul Mullowney PetscFunctionReturn(0); 172ca45077fSPaul Mullowney } 1739ae82921SPaul Mullowney 174e057df02SPaul Mullowney /*@ 175e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 176e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 177aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 178e057df02SPaul Mullowney Not Collective 179e057df02SPaul Mullowney 180e057df02SPaul Mullowney Input Parameters: 1818468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 18236d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. 
MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1832692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 184e057df02SPaul Mullowney 185e057df02SPaul Mullowney Output Parameter: 186e057df02SPaul Mullowney 187e057df02SPaul Mullowney Level: intermediate 188e057df02SPaul Mullowney 1898468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 190e057df02SPaul Mullowney @*/ 191e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 192e057df02SPaul Mullowney { 193e057df02SPaul Mullowney PetscFunctionBegin; 194e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 195cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 196e057df02SPaul Mullowney PetscFunctionReturn(0); 197e057df02SPaul Mullowney } 198e057df02SPaul Mullowney 199365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 200365b711fSMark Adams { 201365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 202365b711fSMark Adams 203365b711fSMark Adams PetscFunctionBegin; 204365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 205365b711fSMark Adams PetscFunctionReturn(0); 206365b711fSMark Adams } 207365b711fSMark Adams 208365b711fSMark Adams /*@ 209365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 
210365b711fSMark Adams 211365b711fSMark Adams Input Parameters: 212365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 213365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 214365b711fSMark Adams 215365b711fSMark Adams Output Parameter: 216365b711fSMark Adams 217365b711fSMark Adams Notes: 218365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 219365b711fSMark Adams and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 220365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 221365b711fSMark Adams 222365b711fSMark Adams Level: intermediate 223365b711fSMark Adams 224365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 225365b711fSMark Adams @*/ 226365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 227365b711fSMark Adams { 228365b711fSMark Adams PetscFunctionBegin; 229365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 230cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 231365b711fSMark Adams PetscFunctionReturn(0); 232365b711fSMark Adams } 233365b711fSMark Adams 2341a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 235e6e9a74fSStefano Zampini { 236e6e9a74fSStefano Zampini PetscFunctionBegin; 2371a2c6b5cSJunchao Zhang switch (op) { 2381a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2391a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2409566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 2411a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2421a2c6b5cSJunchao Zhang break; 2431a2c6b5cSJunchao Zhang 
default: 2449566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 2451a2c6b5cSJunchao Zhang break; 246e6e9a74fSStefano Zampini } 247e6e9a74fSStefano Zampini PetscFunctionReturn(0); 248e6e9a74fSStefano Zampini } 249e6e9a74fSStefano Zampini 250bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 251bddcd29dSMark Adams 252bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 253bddcd29dSMark Adams { 254bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 255bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 256bddcd29dSMark Adams PetscBool row_identity,col_identity; 257365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 258bddcd29dSMark Adams 259bddcd29dSMark Adams PetscFunctionBegin; 2609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2619566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 262bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 263bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. 
*/ 2649566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 2659566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 266bddcd29dSMark Adams if (row_identity && col_identity) { 267365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 268bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 269bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 270365b711fSMark Adams } 271bddcd29dSMark Adams B->ops->matsolve = NULL; 272bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 273bddcd29dSMark Adams } else { 274365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 275bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 276bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 277365b711fSMark Adams } 278bddcd29dSMark Adams B->ops->matsolve = NULL; 279bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 280bddcd29dSMark Adams } 281bddcd29dSMark Adams 282bddcd29dSMark Adams /* get the triangular factors */ 283365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 2849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 285365b711fSMark Adams } 286bddcd29dSMark Adams PetscFunctionReturn(0); 287bddcd29dSMark Adams } 288bddcd29dSMark Adams 2894416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2909ae82921SPaul Mullowney { 291e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2929ae82921SPaul Mullowney PetscBool flg; 293a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2946e111a19SKarl Rupp 2959ae82921SPaul Mullowney PetscFunctionBegin; 296*d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 2979ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 298*d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets 
storage format of (seq)aijcusparse gpu matrices for SpMV", 299*d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 3009566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 301afb2bd1cSJunchao Zhang 302*d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 303*d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 3049566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 3059566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 3069566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 307afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 308*d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 309*d0609cedSBarry Smith "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 310afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 3118efa179dSJose E. 
Roman #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 3122c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 313a435da06SStefano Zampini #else 3142c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 315a435da06SStefano Zampini #endif 316*d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 317*d0609cedSBarry Smith "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 3182c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 319afb2bd1cSJunchao Zhang 320*d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 321*d0609cedSBarry Smith "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 3222c71b3e2SJacob Faibussowitsch PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 323afb2bd1cSJunchao Zhang #endif 3244c87dfd4SPaul Mullowney } 325*d0609cedSBarry Smith PetscOptionsHeadEnd(); 3269ae82921SPaul Mullowney PetscFunctionReturn(0); 3279ae82921SPaul Mullowney } 3289ae82921SPaul Mullowney 3296fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3309ae82921SPaul Mullowney { 
331da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3329ae82921SPaul Mullowney 3339ae82921SPaul Mullowney PetscFunctionBegin; 3349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3359566063dSJacob Faibussowitsch PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 3369ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3379ae82921SPaul Mullowney PetscFunctionReturn(0); 3389ae82921SPaul Mullowney } 3399ae82921SPaul Mullowney 3406fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3419ae82921SPaul Mullowney { 342da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3439ae82921SPaul Mullowney 3449ae82921SPaul Mullowney PetscFunctionBegin; 3459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3469566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 3479ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3489ae82921SPaul Mullowney PetscFunctionReturn(0); 3499ae82921SPaul Mullowney } 3509ae82921SPaul Mullowney 351087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 352087f3262SPaul Mullowney { 353da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 354087f3262SPaul Mullowney 355087f3262SPaul Mullowney PetscFunctionBegin; 3569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3579566063dSJacob Faibussowitsch PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 358087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 359087f3262SPaul 
Mullowney PetscFunctionReturn(0); 360087f3262SPaul Mullowney } 361087f3262SPaul Mullowney 362087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 363087f3262SPaul Mullowney { 364da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 365087f3262SPaul Mullowney 366087f3262SPaul Mullowney PetscFunctionBegin; 3679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3689566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 369087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 370087f3262SPaul Mullowney PetscFunctionReturn(0); 371087f3262SPaul Mullowney } 372087f3262SPaul Mullowney 373087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3749ae82921SPaul Mullowney { 3759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3769ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3779ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 378aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3799ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3809ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3819ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3829ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 3839ae82921SPaul Mullowney 3849ae82921SPaul Mullowney PetscFunctionBegin; 385cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 386c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3879ae82921SPaul Mullowney try { 3889ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's 
on the diagonal. */ 3899ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 390da79fbbcSStefano Zampini if (!loTriFactor) { 3912cbc15d9SMark PetscScalar *AALo; 3922cbc15d9SMark 3939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 3949ae82921SPaul Mullowney 3959ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 3969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 3979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 3989ae82921SPaul Mullowney 3999ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 4009ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 4019ae82921SPaul Mullowney AiLo[n] = nzLower; 4029ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 4039ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4049ae82921SPaul Mullowney v = aa; 4059ae82921SPaul Mullowney vi = aj; 4069ae82921SPaul Mullowney offset = 1; 4079ae82921SPaul Mullowney rowOffset= 1; 4089ae82921SPaul Mullowney for (i=1; i<n; i++) { 4099ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 410e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4119ae82921SPaul Mullowney AiLo[i] = rowOffset; 4129ae82921SPaul Mullowney rowOffset += nz+1; 4139ae82921SPaul Mullowney 4149566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 4159566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 4169ae82921SPaul Mullowney 4179ae82921SPaul Mullowney offset += nz; 4189ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4199ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4209ae82921SPaul Mullowney offset += 1; 4219ae82921SPaul Mullowney 4229ae82921SPaul Mullowney v += nz; 4239ae82921SPaul Mullowney vi += nz; 4249ae82921SPaul Mullowney } 4252205254eSKarl Rupp 426aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4279566063dSJacob Faibussowitsch 
PetscCall(PetscNew(&loTriFactor)); 428da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 429aa372e3fSPaul Mullowney /* Create the matrix description */ 4309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 4319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 434afb2bd1cSJunchao Zhang #else 4359566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 436afb2bd1cSJunchao Zhang #endif 4379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 4389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 439aa372e3fSPaul Mullowney 440aa372e3fSPaul Mullowney /* set the operation */ 441aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 442aa372e3fSPaul Mullowney 443aa372e3fSPaul Mullowney /* set the matrix */ 444aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 445aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 446aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 447aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 448aa372e3fSPaul Mullowney 449aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 450aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 451aa372e3fSPaul Mullowney 452aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 453aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 454aa372e3fSPaul Mullowney 455aa372e3fSPaul Mullowney 
loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 456aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 457aa372e3fSPaul Mullowney 458afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4599566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 4609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 4611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 463afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 464afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 465afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 4665f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 4679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 468afb2bd1cSJunchao Zhang #endif 469afb2bd1cSJunchao Zhang 470aa372e3fSPaul Mullowney /* perform the solve analysis */ 4719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 472aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 473aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 474d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 4751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 476d49cd2b7SBarry Smith loTriFactor->solveInfo, 4775f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 478d49cd2b7SBarry Smith #else 4795f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 480afb2bd1cSJunchao 
Zhang #endif 4819566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4829566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 483aa372e3fSPaul Mullowney 484da79fbbcSStefano Zampini /* assign the pointer */ 485aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4862cbc15d9SMark loTriFactor->AA_h = AALo; 4879566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 4889566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 4899566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 490da79fbbcSStefano Zampini } else { /* update values only */ 4912cbc15d9SMark if (!loTriFactor->AA_h) { 4929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 4932cbc15d9SMark } 494da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4952cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 496da79fbbcSStefano Zampini v = aa; 497da79fbbcSStefano Zampini vi = aj; 498da79fbbcSStefano Zampini offset = 1; 499da79fbbcSStefano Zampini for (i=1; i<n; i++) { 500da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 5019566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 502da79fbbcSStefano Zampini offset += nz; 5032cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 504da79fbbcSStefano Zampini offset += 1; 505da79fbbcSStefano Zampini v += nz; 506da79fbbcSStefano Zampini } 5072cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 5089566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 509da79fbbcSStefano Zampini } 5109ae82921SPaul Mullowney } catch(char *ex) { 51198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5129ae82921SPaul Mullowney } 5139ae82921SPaul Mullowney } 5149ae82921SPaul Mullowney PetscFunctionReturn(0); 
5159ae82921SPaul Mullowney } 5169ae82921SPaul Mullowney 517087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5189ae82921SPaul Mullowney { 5199ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5209ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5219ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 522aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5239ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5249ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5259ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5269ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5279ae82921SPaul Mullowney 5289ae82921SPaul Mullowney PetscFunctionBegin; 529cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 530c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5319ae82921SPaul Mullowney try { 5329ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5339ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 534da79fbbcSStefano Zampini if (!upTriFactor) { 5352cbc15d9SMark PetscScalar *AAUp; 5362cbc15d9SMark 5379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 5382cbc15d9SMark 5399ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 5409566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 5419566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 5429ae82921SPaul Mullowney 5439ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5449ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5459ae82921SPaul Mullowney AiUp[n]=nzUpper; 5469ae82921SPaul Mullowney offset = nzUpper; 5479ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5489ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5499ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5509ae82921SPaul Mullowney 551e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5529ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5539ae82921SPaul Mullowney 554e057df02SPaul Mullowney /* decrement the offset */ 5559ae82921SPaul Mullowney offset -= (nz+1); 5569ae82921SPaul Mullowney 557e057df02SPaul Mullowney /* first, set the diagonal elements */ 5589ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 55909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5609ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5619ae82921SPaul Mullowney 5629566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 5639566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 5649ae82921SPaul Mullowney } 5652205254eSKarl Rupp 566aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 5679566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 568da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5692205254eSKarl Rupp 570aa372e3fSPaul Mullowney /* Create the matrix description */ 5719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 5729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 5731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 5749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 575afb2bd1cSJunchao Zhang #else 5769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 577afb2bd1cSJunchao Zhang #endif 5789566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 5799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 580aa372e3fSPaul Mullowney 581aa372e3fSPaul Mullowney /* set the operation */ 582aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 583aa372e3fSPaul Mullowney 584aa372e3fSPaul Mullowney /* set the matrix */ 585aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 586aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 587aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 588aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 589aa372e3fSPaul Mullowney 590aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 591aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 592aa372e3fSPaul Mullowney 593aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 594aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 595aa372e3fSPaul Mullowney 596aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
597aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 598aa372e3fSPaul Mullowney 599afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6009566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 6019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 6021b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 6039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 604afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 605afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 606afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 6075f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 6089566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 609afb2bd1cSJunchao Zhang #endif 610afb2bd1cSJunchao Zhang 611aa372e3fSPaul Mullowney /* perform the solve analysis */ 6129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 613aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 614aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 615d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6161b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 617d49cd2b7SBarry Smith upTriFactor->solveInfo, 6185f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 619d49cd2b7SBarry Smith #else 6205f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 621afb2bd1cSJunchao Zhang #endif 6229566063dSJacob Faibussowitsch 
PetscCallCUDA(WaitForCUDA()); 6239566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 624aa372e3fSPaul Mullowney 625da79fbbcSStefano Zampini /* assign the pointer */ 626aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6272cbc15d9SMark upTriFactor->AA_h = AAUp; 6289566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 6299566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 6309566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 631da79fbbcSStefano Zampini } else { 6322cbc15d9SMark if (!upTriFactor->AA_h) { 6339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 6342cbc15d9SMark } 635da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 636da79fbbcSStefano Zampini offset = nzUpper; 637da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 638da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 639da79fbbcSStefano Zampini 640da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 641da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 642da79fbbcSStefano Zampini 643da79fbbcSStefano Zampini /* decrement the offset */ 644da79fbbcSStefano Zampini offset -= (nz+1); 645da79fbbcSStefano Zampini 646da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6472cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6489566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 649da79fbbcSStefano Zampini } 6502cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 6519566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 652da79fbbcSStefano Zampini } 6539ae82921SPaul Mullowney } catch(char *ex) { 65498921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6559ae82921SPaul 
Mullowney } 6569ae82921SPaul Mullowney } 6579ae82921SPaul Mullowney PetscFunctionReturn(0); 6589ae82921SPaul Mullowney } 6599ae82921SPaul Mullowney 660087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6619ae82921SPaul Mullowney { 6629ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6639ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6649ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6659ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6669ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6679ae82921SPaul Mullowney 6689ae82921SPaul Mullowney PetscFunctionBegin; 66928b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 6709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 6719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 6722205254eSKarl Rupp 673da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 674aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6759ae82921SPaul Mullowney 676c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 677e057df02SPaul Mullowney /* lower triangular indices */ 6789566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 679da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 680da79fbbcSStefano Zampini const PetscInt *r; 681da79fbbcSStefano Zampini 6829566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r)); 683aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 684aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6859566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r)); 6869566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 
687da79fbbcSStefano Zampini } 6889ae82921SPaul Mullowney 689e057df02SPaul Mullowney /* upper triangular indices */ 6909566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 691da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 692da79fbbcSStefano Zampini const PetscInt *c; 693da79fbbcSStefano Zampini 6949566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&c)); 695aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 696aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6979566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&c)); 6989566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 699da79fbbcSStefano Zampini } 7009ae82921SPaul Mullowney PetscFunctionReturn(0); 7019ae82921SPaul Mullowney } 7029ae82921SPaul Mullowney 703087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 704087f3262SPaul Mullowney { 705087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 706087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 707aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 708aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 709087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 710087f3262SPaul Mullowney PetscScalar *AAUp; 711087f3262SPaul Mullowney PetscScalar *AALo; 712087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 713087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 714087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 715087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 716087f3262SPaul Mullowney 717087f3262SPaul Mullowney PetscFunctionBegin; 718cf00fe3bSKarl Rupp if (!n) 
PetscFunctionReturn(0); 719c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 720087f3262SPaul Mullowney try { 7219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 7229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 723da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 724087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 7259566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 7269566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 727087f3262SPaul Mullowney 728087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 729087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 730087f3262SPaul Mullowney AiUp[n]=nzUpper; 731087f3262SPaul Mullowney offset = 0; 732087f3262SPaul Mullowney for (i=0; i<n; i++) { 733087f3262SPaul Mullowney /* set the pointers */ 734087f3262SPaul Mullowney v = aa + ai[i]; 735087f3262SPaul Mullowney vj = aj + ai[i]; 736087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 737087f3262SPaul Mullowney 738087f3262SPaul Mullowney /* first, set the diagonal elements */ 739087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 74009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 741087f3262SPaul Mullowney AiUp[i] = offset; 74209f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 743087f3262SPaul Mullowney 744087f3262SPaul Mullowney offset+=1; 745087f3262SPaul Mullowney if (nz>0) { 7469566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 7479566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 748087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 749087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 750087f3262SPaul Mullowney AALo[j] = 
AAUp[j]/v[nz]; 751087f3262SPaul Mullowney } 752087f3262SPaul Mullowney offset+=nz; 753087f3262SPaul Mullowney } 754087f3262SPaul Mullowney } 755087f3262SPaul Mullowney 756aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 7579566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 758da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 759087f3262SPaul Mullowney 760aa372e3fSPaul Mullowney /* Create the matrix description */ 7619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 7629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 7631b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 765afb2bd1cSJunchao Zhang #else 7669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 767afb2bd1cSJunchao Zhang #endif 7689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 7699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 770087f3262SPaul Mullowney 771aa372e3fSPaul Mullowney /* set the matrix */ 772aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 773aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 774aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 775aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 776aa372e3fSPaul Mullowney 777aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 778aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 779aa372e3fSPaul Mullowney 780aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices 
= new THRUSTINTARRAY32(a->nz); 781aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 782aa372e3fSPaul Mullowney 783aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 784aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 785aa372e3fSPaul Mullowney 786afb2bd1cSJunchao Zhang /* set the operation */ 787afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 788afb2bd1cSJunchao Zhang 789afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 7909566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 7919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 7921b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7939566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 794afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 795afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 796afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 7975f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 7989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 799afb2bd1cSJunchao Zhang #endif 800afb2bd1cSJunchao Zhang 801aa372e3fSPaul Mullowney /* perform the solve analysis */ 8029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 803aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 804aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 805d49cd2b7SBarry Smith 
upTriFactor->csrMat->column_indices->data().get(), 8061b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 807d49cd2b7SBarry Smith upTriFactor->solveInfo, 8085f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 809d49cd2b7SBarry Smith #else 8105f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 811afb2bd1cSJunchao Zhang #endif 8129566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8139566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 814aa372e3fSPaul Mullowney 815da79fbbcSStefano Zampini /* assign the pointer */ 816aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 817aa372e3fSPaul Mullowney 818aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8199566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 820da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 821aa372e3fSPaul Mullowney 822aa372e3fSPaul Mullowney /* Create the matrix description */ 8239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 8249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8251b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8269566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 827afb2bd1cSJunchao Zhang #else 8289566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 829afb2bd1cSJunchao Zhang #endif 8309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 832aa372e3fSPaul Mullowney 833aa372e3fSPaul Mullowney /* set the operation */ 
834aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 835aa372e3fSPaul Mullowney 836aa372e3fSPaul Mullowney /* set the matrix */ 837aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 838aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 839aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 840aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 841aa372e3fSPaul Mullowney 842aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 843aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 846aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 847aa372e3fSPaul Mullowney 848aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 849aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 850aa372e3fSPaul Mullowney 851afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8529566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 8539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 8541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 8559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 856afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 857afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 858afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 8595f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 8609566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 861afb2bd1cSJunchao Zhang #endif 862afb2bd1cSJunchao Zhang 863aa372e3fSPaul Mullowney /* perform the solve analysis */ 8649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 865aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 866aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 867d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 8681b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 869d49cd2b7SBarry Smith loTriFactor->solveInfo, 8705f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 871d49cd2b7SBarry Smith #else 8725f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 873afb2bd1cSJunchao Zhang #endif 8749566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8759566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 876aa372e3fSPaul Mullowney 877da79fbbcSStefano Zampini /* assign the pointer */ 878aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 879087f3262SPaul Mullowney 8809566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 8819566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 8829566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 883da79fbbcSStefano Zampini } else { 884da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 885da79fbbcSStefano Zampini offset = 0; 886da79fbbcSStefano Zampini for (i=0; i<n; i++) { 887da79fbbcSStefano Zampini /* set the pointers */ 888da79fbbcSStefano Zampini v = aa + ai[i]; 889da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 890da79fbbcSStefano Zampini 891da79fbbcSStefano 
Zampini /* first, set the diagonal elements */ 892da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 893da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 894da79fbbcSStefano Zampini 895da79fbbcSStefano Zampini offset+=1; 896da79fbbcSStefano Zampini if (nz>0) { 8979566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 898da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 899da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 900da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 901da79fbbcSStefano Zampini } 902da79fbbcSStefano Zampini offset+=nz; 903da79fbbcSStefano Zampini } 904da79fbbcSStefano Zampini } 90528b400f6SJacob Faibussowitsch PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 90628b400f6SJacob Faibussowitsch PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 908da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 9099566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 910da79fbbcSStefano Zampini } 9119566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 9129566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 913087f3262SPaul Mullowney } catch(char *ex) { 91498921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 915087f3262SPaul Mullowney } 916087f3262SPaul Mullowney } 917087f3262SPaul Mullowney PetscFunctionReturn(0); 918087f3262SPaul Mullowney } 919087f3262SPaul Mullowney 920087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9219ae82921SPaul Mullowney { 922087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 923087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 924087f3262SPaul Mullowney IS ip = a->row; 925087f3262SPaul Mullowney PetscBool 
perm_identity; 926087f3262SPaul Mullowney PetscInt n = A->rmap->n; 927087f3262SPaul Mullowney 928087f3262SPaul Mullowney PetscFunctionBegin; 92928b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 9309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 931da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 932aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 933aa372e3fSPaul Mullowney 934da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 935da79fbbcSStefano Zampini 936087f3262SPaul Mullowney /* lower triangular indices */ 9379566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 938087f3262SPaul Mullowney if (!perm_identity) { 9394e4bbfaaSStefano Zampini IS iip; 940da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9414e4bbfaaSStefano Zampini 9429566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 9439566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip,&irip)); 9449566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip,&rip)); 945aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 946aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 947aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9484e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9499566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip,&irip)); 9509566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 9519566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip,&rip)); 9529566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 953da79fbbcSStefano Zampini } 954087f3262SPaul Mullowney PetscFunctionReturn(0); 955087f3262SPaul Mullowney } 956087f3262SPaul Mullowney 957087f3262SPaul Mullowney static PetscErrorCode 
MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 958087f3262SPaul Mullowney { 959087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 960087f3262SPaul Mullowney IS ip = b->row; 961087f3262SPaul Mullowney PetscBool perm_identity; 962087f3262SPaul Mullowney 963087f3262SPaul Mullowney PetscFunctionBegin; 9649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 9659566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 966ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 967087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 9689566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 969087f3262SPaul Mullowney if (perm_identity) { 970087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 971087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9724e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9734e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 974087f3262SPaul Mullowney } else { 975087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 976087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9774e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9784e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 979087f3262SPaul Mullowney } 980087f3262SPaul Mullowney 981087f3262SPaul Mullowney /* get the triangular factors */ 9829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 983087f3262SPaul Mullowney PetscFunctionReturn(0); 984087f3262SPaul Mullowney } 9859ae82921SPaul Mullowney 986b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 987bda325fcSPaul Mullowney { 988bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 989aa372e3fSPaul Mullowney 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 990aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 991da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 992da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 993aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 994aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 995aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 996aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 997b175d8bbSPaul Mullowney 998bda325fcSPaul Mullowney PetscFunctionBegin; 999aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 10009566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 1001da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1002aa372e3fSPaul Mullowney 1003aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1004aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1005aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1006aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1007aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1008aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1009aa372e3fSPaul Mullowney 1010aa372e3fSPaul Mullowney /* Create the matrix description */ 10119566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 10129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 10139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 10149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 10159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1016aa372e3fSPaul Mullowney 1017aa372e3fSPaul Mullowney /* set the operation */ 1018aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1019aa372e3fSPaul Mullowney 1020aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1021aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1022afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1023afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1024aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1025afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1026afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1027afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1028aa372e3fSPaul Mullowney 1029aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1030afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 10319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1032afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1033afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1034afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1035afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1036afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1037afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1038afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 10395f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 10409566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1041afb2bd1cSJunchao Zhang #endif 1042afb2bd1cSJunchao Zhang 10439566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 10449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1045aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1046aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1047aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1048aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1049aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1050afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1051afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 
10535f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1054afb2bd1cSJunchao Zhang #else 1055afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 10565f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1057afb2bd1cSJunchao Zhang #endif 10589566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10599566063dSJacob Faibussowitsch /* close the GenerateTranspose event opened before the csr2csc call (lower factor); it must be paired with an End, not a second Begin */ PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1060aa372e3fSPaul Mullowney 1061afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 10629566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 10639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 10641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 10659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1066afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1067afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 10695f80ce2aSJacob Faibussowitsch &loTriFactorT->solveBufferSize)); 10709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1071afb2bd1cSJunchao Zhang #endif 1072afb2bd1cSJunchao Zhang 1073afb2bd1cSJunchao Zhang /* perform the solve analysis */ 10749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1075afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 
loTriFactorT->csrMat->row_offsets->data().get(), 1077d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 10781b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1079d49cd2b7SBarry Smith loTriFactorT->solveInfo, 10805f80ce2aSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1081d49cd2b7SBarry Smith #else 10825f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 1083afb2bd1cSJunchao Zhang #endif 10849566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10859566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1086aa372e3fSPaul Mullowney 1087da79fbbcSStefano Zampini /* assign the pointer */ 1088aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1089aa372e3fSPaul Mullowney 1090aa372e3fSPaul Mullowney /*********************************************/ 1091aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1092aa372e3fSPaul Mullowney /*********************************************/ 1093aa372e3fSPaul Mullowney 1094aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 10959566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 1096da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1097aa372e3fSPaul Mullowney 1098aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1099aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1100aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1101aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1102aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1103aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1104aa372e3fSPaul Mullowney 1105aa372e3fSPaul Mullowney /* Create the matrix description */ 11069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 11079566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 11089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 11099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 11109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1111aa372e3fSPaul Mullowney 1112aa372e3fSPaul Mullowney /* set the operation */ 1113aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1114aa372e3fSPaul Mullowney 1115aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1116aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1117afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1118afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1119aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1120afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1121afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1122afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1123aa372e3fSPaul Mullowney 1124aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1125afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 11269566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1127afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1128afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1129afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1130afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1131afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1132afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1133afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 11345f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 11359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1136afb2bd1cSJunchao Zhang #endif 1137afb2bd1cSJunchao Zhang 11389566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 11399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1140aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1141aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1142aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1143aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1144aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1145afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1146afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1147afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 
11485f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1149afb2bd1cSJunchao Zhang #else 1150afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 11515f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1152afb2bd1cSJunchao Zhang #endif 1153d49cd2b7SBarry Smith 11549566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11559566063dSJacob Faibussowitsch /* close the GenerateTranspose event opened before the csr2csc call (upper factor); it must be paired with an End, not a second Begin */ PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1156aa372e3fSPaul Mullowney 1157afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11589566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 11599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 11601b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 11619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1162afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1163afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1164afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 11655f80ce2aSJacob Faibussowitsch &upTriFactorT->solveBufferSize)); 11669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1167afb2bd1cSJunchao Zhang #endif 1168afb2bd1cSJunchao Zhang 1169afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11705f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? 
*/ 11719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1172afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1173afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1174d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 11751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1176d49cd2b7SBarry Smith upTriFactorT->solveInfo, 11775f80ce2aSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1178d49cd2b7SBarry Smith #else 11795f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1180afb2bd1cSJunchao Zhang #endif 1181d49cd2b7SBarry Smith 11829566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11839566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1184aa372e3fSPaul Mullowney 1185da79fbbcSStefano Zampini /* assign the pointer */ 1186aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1187bda325fcSPaul Mullowney PetscFunctionReturn(0); 1188bda325fcSPaul Mullowney } 1189bda325fcSPaul Mullowney 1190a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1191a49f1ed0SStefano Zampini { 1192a49f1ed0SStefano Zampini __host__ __device__ 1193a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1194a49f1ed0SStefano Zampini { 1195a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1196a49f1ed0SStefano Zampini } 1197a49f1ed0SStefano Zampini }; 1198a49f1ed0SStefano Zampini 11993606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1200bda325fcSPaul Mullowney { 1201aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1202a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1203bda325fcSPaul Mullowney Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1204bda325fcSPaul Mullowney cusparseStatus_t stat; 1205aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1206b175d8bbSPaul Mullowney 1207bda325fcSPaul Mullowney PetscFunctionBegin; 12089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1209a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 121028b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1211a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 121208401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 12131a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 12149566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 12159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1216a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 12179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1218a49f1ed0SStefano Zampini } 1219a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1220aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1222aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1225aa372e3fSPaul Mullowney 1226b06137fdSPaul Mullowney /* set alpha and beta */ 12279566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 12289566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 12299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 12309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 12319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 12329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1233b06137fdSPaul Mullowney 1234aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1235aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1236a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1237554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1238554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1239aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1240a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1241aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1242aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1243a3fdcf43SKarl Rupp 1244039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 124581902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1246afb2bd1cSJunchao Zhang 1247afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 12483606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1249afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1250afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1251afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1252afb2bd1cSJunchao Zhang 
matrixT->values->data().get(), 1253afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 12549566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 12553606e59fSJunchao Zhang #else 12563606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12573606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12583606e59fSJunchao Zhang 12593606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12603606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12613606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 12623606e59fSJunchao Zhang */ 12633606e59fSJunchao Zhang if (matrixT->num_entries) { 12643606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 12653606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 12663606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 12673606e59fSJunchao Zhang matrixT->values->data().get(), 12683606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 12699566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 12703606e59fSJunchao Zhang 12713606e59fSJunchao Zhang } else { 12723606e59fSJunchao Zhang matstructT->matDescr = NULL; 12733606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12743606e59fSJunchao Zhang } 12753606e59fSJunchao Zhang #endif 1276afb2bd1cSJunchao Zhang #endif 1277aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1278afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 
1279afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1280afb2bd1cSJunchao Zhang #else 1281aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 128251c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 128351c6d536SStefano Zampini /* First convert HYB to CSR */ 1284aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1285aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1286aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1287aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1288aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1289aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1290aa372e3fSPaul Mullowney 1291aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1292aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1293aa372e3fSPaul Mullowney temp->values->data().get(), 1294aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 12959566063dSJacob Faibussowitsch temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1296aa372e3fSPaul Mullowney 1297aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1298aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1299aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1300aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1301aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1302aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1303aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1304aa372e3fSPaul Mullowney 1305aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1306aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1307aa372e3fSPaul Mullowney temp->values->data().get(), 1308aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1309aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1310aa372e3fSPaul Mullowney tempT->values->data().get(), 1311aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1312aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 13139566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1314aa372e3fSPaul Mullowney 1315aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1316aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1318aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1319aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1320aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1321aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1322aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1323aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 13249566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1325aa372e3fSPaul Mullowney 1326aa372e3fSPaul Mullowney /* assign the pointer */ 1327aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13281a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1329aa372e3fSPaul Mullowney /* delete temporaries */ 1330aa372e3fSPaul Mullowney if (tempT) { 1331aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1332aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1333aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1334aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1335087f3262SPaul Mullowney } 1336aa372e3fSPaul Mullowney if (temp) { 1337aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1338aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1339aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1340aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1341aa372e3fSPaul Mullowney } 1342afb2bd1cSJunchao Zhang #endif 1343aa372e3fSPaul Mullowney } 1344a49f1ed0SStefano Zampini } 1345a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1346a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1347a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 134828b400f6SJacob Faibussowitsch 
PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 134928b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 135028b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 135128b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 135228b400f6SJacob Faibussowitsch PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 135328b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 135428b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 135528b400f6SJacob Faibussowitsch PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1356a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1357a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1358a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 13599566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1360a49f1ed0SStefano Zampini } 1361a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1362a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1363a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1364a49f1ed0SStefano Zampini 1365a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1366a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1367a49f1ed0SStefano Zampini void *csr2cscBuffer; 1368a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1369a49f1ed0SStefano Zampini stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1370a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1371a49f1ed0SStefano Zampini matrix->values->data().get(), 1372a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1373a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1374a49f1ed0SStefano Zampini matrixT->values->data().get(), 1375a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1376a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13779566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 13789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1379a49f1ed0SStefano Zampini #endif 1380a49f1ed0SStefano Zampini 13811a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13821a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13831a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13841a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13851a2c6b5cSJunchao Zhang 13861a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13871a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13881a2c6b5cSJunchao Zhang */ 13891a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 13901a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 13911a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 13921a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 13931a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1394a49f1ed0SStefano Zampini matrixT->values->data().get(), 1395a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1396a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1397a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13989566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1399a49f1ed0SStefano Zampini #else 1400a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 14019566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1402a49f1ed0SStefano Zampini #endif 14031a2c6b5cSJunchao Zhang } else { 14041a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 14051a2c6b5cSJunchao Zhang } 14061a2c6b5cSJunchao Zhang 1407a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1408a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1409a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 14109566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1411a49f1ed0SStefano Zampini #endif 1412a49f1ed0SStefano Zampini } 1413a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1414a49f1ed0SStefano Zampini 
thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1415a49f1ed0SStefano Zampini matrixT->values->begin())); 1416a49f1ed0SStefano Zampini } 14179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14189566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1419213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1420213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1421aa372e3fSPaul Mullowney /* assign the pointer */ 1422aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14231a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1424bda325fcSPaul Mullowney PetscFunctionReturn(0); 1425bda325fcSPaul Mullowney } 1426bda325fcSPaul Mullowney 1427a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14286fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1429bda325fcSPaul Mullowney { 1430c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1431465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1432465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1433465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1434465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1435bda325fcSPaul Mullowney cusparseStatus_t stat; 1436bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1437aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1438aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1439aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = 
(THRUSTARRAY*)cusparseTriFactors->workVector; 1440bda325fcSPaul Mullowney 1441bda325fcSPaul Mullowney PetscFunctionBegin; 1442aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1443aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 14449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1445aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1446aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1447bda325fcSPaul Mullowney } 1448bda325fcSPaul Mullowney 1449bda325fcSPaul Mullowney /* Get the GPU pointers */ 14509566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14519566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1452c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1453c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1454bda325fcSPaul Mullowney 14559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1456aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1457a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1458c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1459c41cb2e2SAlejandro Lamas Daviña xGPU); 1460aa372e3fSPaul Mullowney 1461aa372e3fSPaul Mullowney /* First, solve U */ 1462aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1463afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1465afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1466afb2bd1cSJunchao Zhang #endif 1467afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, 
upTriFactorT->descr, 1468aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1469aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1470aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1471aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1472d49cd2b7SBarry Smith xarray, 14731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1474d49cd2b7SBarry Smith tempGPU->data().get(), 14759566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1476d49cd2b7SBarry Smith #else 14779566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1478afb2bd1cSJunchao Zhang #endif 1479aa372e3fSPaul Mullowney 1480aa372e3fSPaul Mullowney /* Then, solve L */ 1481aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1482afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14831b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1484afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1485afb2bd1cSJunchao Zhang #endif 1486afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1487aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1488aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1489aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1490aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1491d49cd2b7SBarry Smith tempGPU->data().get(), 14921b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1493d49cd2b7SBarry Smith xarray, 14949566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1495d49cd2b7SBarry Smith #else 14969566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1497afb2bd1cSJunchao Zhang #endif 1498aa372e3fSPaul Mullowney 1499aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... 
can't be done in place. */ 1500a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1501c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1502aa372e3fSPaul Mullowney tempGPU->begin()); 1503aa372e3fSPaul Mullowney 1504aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1505a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1506bda325fcSPaul Mullowney 1507bda325fcSPaul Mullowney /* restore */ 15089566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15099566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1512bda325fcSPaul Mullowney PetscFunctionReturn(0); 1513bda325fcSPaul Mullowney } 1514bda325fcSPaul Mullowney 15156fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1516bda325fcSPaul Mullowney { 1517465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1518465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1519bda325fcSPaul Mullowney cusparseStatus_t stat; 1520bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1521aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1522aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1523aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1524bda325fcSPaul Mullowney 
1525bda325fcSPaul Mullowney PetscFunctionBegin; 1526aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1527aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1529aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1530aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1531bda325fcSPaul Mullowney } 1532bda325fcSPaul Mullowney 1533bda325fcSPaul Mullowney /* Get the GPU pointers */ 15349566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 15359566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1536bda325fcSPaul Mullowney 15379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1538aa372e3fSPaul Mullowney /* First, solve U */ 1539aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1540afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1542afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1543afb2bd1cSJunchao Zhang #endif 1544afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1545aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1546aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1547aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1548aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1549d49cd2b7SBarry Smith barray, 15501b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1551d49cd2b7SBarry Smith tempGPU->data().get(), 15529566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1553d49cd2b7SBarry Smith #else 15549566063dSJacob Faibussowitsch 
tempGPU->data().get());PetscCallCUSPARSE(stat); 1555afb2bd1cSJunchao Zhang #endif 1556aa372e3fSPaul Mullowney 1557aa372e3fSPaul Mullowney /* Then, solve L */ 1558aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1559afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15601b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1561afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1562afb2bd1cSJunchao Zhang #endif 1563afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1564aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1565aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1566aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1567aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1568d49cd2b7SBarry Smith tempGPU->data().get(), 15691b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1570d49cd2b7SBarry Smith xarray, 15719566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1572d49cd2b7SBarry Smith #else 15739566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1574afb2bd1cSJunchao Zhang #endif 1575bda325fcSPaul Mullowney 1576bda325fcSPaul Mullowney /* restore */ 15779566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15789566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1581bda325fcSPaul Mullowney PetscFunctionReturn(0); 1582bda325fcSPaul Mullowney } 1583bda325fcSPaul Mullowney 15846fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15859ae82921SPaul Mullowney { 1586465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1587465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 
1588465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1589465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15909ae82921SPaul Mullowney cusparseStatus_t stat; 15919ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1592aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1593aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1594aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 15959ae82921SPaul Mullowney 15969ae82921SPaul Mullowney PetscFunctionBegin; 1597ebc8f436SDominic Meiser 1598e057df02SPaul Mullowney /* Get the GPU pointers */ 15999566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 16009566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1601c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1602c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16039ae82921SPaul Mullowney 16049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1605aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1606a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1607c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 16084e4bbfaaSStefano Zampini tempGPU->begin()); 1609aa372e3fSPaul Mullowney 1610aa372e3fSPaul Mullowney /* Next, solve L */ 1611aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1612afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16131b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1614afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1615afb2bd1cSJunchao Zhang #endif 1616afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1617aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1618aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1619aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1620aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1621d49cd2b7SBarry Smith tempGPU->data().get(), 16221b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1623d49cd2b7SBarry Smith xarray, 16249566063dSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1625d49cd2b7SBarry Smith #else 16269566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1627afb2bd1cSJunchao Zhang #endif 1628aa372e3fSPaul Mullowney 1629aa372e3fSPaul Mullowney /* Then, solve U */ 1630aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1631afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1633afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1634afb2bd1cSJunchao Zhang #endif 1635afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1636aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1637aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1638aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1639d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 16401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1641d49cd2b7SBarry Smith tempGPU->data().get(), 16429566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1643d49cd2b7SBarry Smith #else 16449566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1645afb2bd1cSJunchao Zhang #endif 1646d49cd2b7SBarry 
Smith 16474e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1648a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 16494e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16504e4bbfaaSStefano Zampini xGPU); 16519ae82921SPaul Mullowney 16529566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 16539566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 16549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 16569ae82921SPaul Mullowney PetscFunctionReturn(0); 16579ae82921SPaul Mullowney } 16589ae82921SPaul Mullowney 16596fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16609ae82921SPaul Mullowney { 1661465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1662465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16639ae82921SPaul Mullowney cusparseStatus_t stat; 16649ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1665aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1666aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1667aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 16689ae82921SPaul Mullowney 16699ae82921SPaul Mullowney PetscFunctionBegin; 1670e057df02SPaul Mullowney /* Get the GPU pointers */ 16719566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 16729566063dSJacob Faibussowitsch 
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 16739ae82921SPaul Mullowney 16749566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1675aa372e3fSPaul Mullowney /* First, solve L */ 1676aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1677afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16781b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1679afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1680afb2bd1cSJunchao Zhang #endif 1681afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1682aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1683aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1684aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1685aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1686d49cd2b7SBarry Smith barray, 16871b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1688d49cd2b7SBarry Smith tempGPU->data().get(), 16899566063dSJacob Faibussowitsch loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1690d49cd2b7SBarry Smith #else 16919566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1692afb2bd1cSJunchao Zhang #endif 1693d49cd2b7SBarry Smith 1694aa372e3fSPaul Mullowney /* Next, solve U */ 1695aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1696afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16971b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1698afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1699afb2bd1cSJunchao Zhang #endif 1700afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1701aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1702aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1703aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1704aa372e3fSPaul Mullowney upTriFactor->solveInfo, 
1705d49cd2b7SBarry Smith tempGPU->data().get(), 17061b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1707d49cd2b7SBarry Smith xarray, 17089566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1709d49cd2b7SBarry Smith #else 17109566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1711afb2bd1cSJunchao Zhang #endif 17129ae82921SPaul Mullowney 17139566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 17149566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 17159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 17179ae82921SPaul Mullowney PetscFunctionReturn(0); 17189ae82921SPaul Mullowney } 17199ae82921SPaul Mullowney 17207e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17217e8381f9SStefano Zampini { 17227e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17237e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17247e8381f9SStefano Zampini 17257e8381f9SStefano Zampini PetscFunctionBegin; 17267e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17277e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17287e8381f9SStefano Zampini 17299566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 17319566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 17329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 17339566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17347e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17357e8381f9SStefano Zampini } 
17367e8381f9SStefano Zampini PetscFunctionReturn(0); 17377e8381f9SStefano Zampini } 17387e8381f9SStefano Zampini 17397e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17407e8381f9SStefano Zampini { 17417e8381f9SStefano Zampini PetscFunctionBegin; 17429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 174367a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 174467a45760SJunchao Zhang PetscFunctionReturn(0); 174567a45760SJunchao Zhang } 174667a45760SJunchao Zhang 174767a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 174867a45760SJunchao Zhang { 174967a45760SJunchao Zhang PetscFunctionBegin; 17507e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 175167a45760SJunchao Zhang *array = NULL; 175267a45760SJunchao Zhang PetscFunctionReturn(0); 175367a45760SJunchao Zhang } 175467a45760SJunchao Zhang 175567a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 175667a45760SJunchao Zhang { 175767a45760SJunchao Zhang PetscFunctionBegin; 17589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 175967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 176067a45760SJunchao Zhang PetscFunctionReturn(0); 176167a45760SJunchao Zhang } 176267a45760SJunchao Zhang 176367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 176467a45760SJunchao Zhang { 176567a45760SJunchao Zhang PetscFunctionBegin; 176667a45760SJunchao Zhang *array = NULL; 176767a45760SJunchao Zhang PetscFunctionReturn(0); 176867a45760SJunchao Zhang } 176967a45760SJunchao Zhang 177067a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 177167a45760SJunchao Zhang { 177267a45760SJunchao Zhang PetscFunctionBegin; 177367a45760SJunchao Zhang *array = 
((Mat_SeqAIJ*)A->data)->a; 177467a45760SJunchao Zhang PetscFunctionReturn(0); 177567a45760SJunchao Zhang } 177667a45760SJunchao Zhang 177767a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 177867a45760SJunchao Zhang { 177967a45760SJunchao Zhang PetscFunctionBegin; 178067a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 178167a45760SJunchao Zhang *array = NULL; 17827e8381f9SStefano Zampini PetscFunctionReturn(0); 17837e8381f9SStefano Zampini } 17847e8381f9SStefano Zampini 17857ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 17867ee59b9bSJunchao Zhang { 17877ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 17887ee59b9bSJunchao Zhang CsrMatrix *matrix; 17897ee59b9bSJunchao Zhang 17907ee59b9bSJunchao Zhang PetscFunctionBegin; 17917ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 17927ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 17937ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 17947ee59b9bSJunchao Zhang PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 17957ee59b9bSJunchao Zhang matrix = (CsrMatrix*)cusp->mat->mat; 17967ee59b9bSJunchao Zhang 17977ee59b9bSJunchao Zhang if (i) { 17987ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 17997ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 18007ee59b9bSJunchao Zhang #else 18017ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 18027ee59b9bSJunchao Zhang #endif 18037ee59b9bSJunchao Zhang } 18047ee59b9bSJunchao Zhang if (j) { 18057ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 18067ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 
18077ee59b9bSJunchao Zhang #else 18087ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 18097ee59b9bSJunchao Zhang #endif 18107ee59b9bSJunchao Zhang } 18117ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 18127ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 18137ee59b9bSJunchao Zhang PetscFunctionReturn(0); 18147ee59b9bSJunchao Zhang } 18157ee59b9bSJunchao Zhang 1816042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 18179ae82921SPaul Mullowney { 1818aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 18197c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 18209ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1821213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1822aa372e3fSPaul Mullowney cusparseStatus_t stat; 1823abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 18249ae82921SPaul Mullowney 18259ae82921SPaul Mullowney PetscFunctionBegin; 182628b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1827c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1828a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1829a49f1ed0SStefano Zampini CsrMatrix *matrix; 1830afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 183185ba7357SStefano Zampini 183208401ef6SPierre Jolivet PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 18339566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1834afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 18359566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 18369566063dSJacob Faibussowitsch 
PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 18379566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 183934d6c7a5SJose E. Roman } else { 1840abb89eb1SStefano Zampini PetscInt nnz; 18419566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 18439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 18447c700b8dSJunchao Zhang delete cusparsestruct->workVector; 184581902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1846a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1847a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18489ae82921SPaul Mullowney try { 18499ae82921SPaul Mullowney if (a->compressedrow.use) { 18509ae82921SPaul Mullowney m = a->compressedrow.nrows; 18519ae82921SPaul Mullowney ii = a->compressedrow.i; 18529ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 18539ae82921SPaul Mullowney } else { 1854213423ffSJunchao Zhang m = A->rmap->n; 1855213423ffSJunchao Zhang ii = a->i; 1856e6e9a74fSStefano Zampini ridx = NULL; 18579ae82921SPaul Mullowney } 185808401ef6SPierre Jolivet PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1859abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1860abb89eb1SStefano Zampini else nnz = a->nz; 186108401ef6SPierre Jolivet PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 18629ae82921SPaul Mullowney 186385ba7357SStefano Zampini /* create cusparse matrix */ 1864abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1865aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 18669566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 18679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 18689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 18699ae82921SPaul Mullowney 18709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 18719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 18729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 18739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 1877b06137fdSPaul Mullowney 1878aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1879aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1880aa372e3fSPaul Mullowney /* set the matrix */ 1881afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1882afb2bd1cSJunchao Zhang mat->num_rows = m; 1883afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1884abb89eb1SStefano Zampini mat->num_entries = nnz; 1885afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1886afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18879ae82921SPaul Mullowney 1888abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1889abb89eb1SStefano Zampini 
mat->column_indices->assign(a->j, a->j+nnz); 1890aa372e3fSPaul Mullowney 1891abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1892abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1893aa372e3fSPaul Mullowney 1894aa372e3fSPaul Mullowney /* assign the pointer */ 1895afb2bd1cSJunchao Zhang matstruct->mat = mat; 1896afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1897afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1898afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1899afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1900afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1901afb2bd1cSJunchao Zhang mat->values->data().get(), 1902afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 19039566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 1904afb2bd1cSJunchao Zhang } 1905afb2bd1cSJunchao Zhang #endif 1906aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1907afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1908afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1909afb2bd1cSJunchao Zhang #else 1910afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1911afb2bd1cSJunchao Zhang mat->num_rows = m; 1912afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1913abb89eb1SStefano Zampini mat->num_entries = nnz; 1914afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1915afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1916aa372e3fSPaul Mullowney 1917abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1918abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 
1919aa372e3fSPaul Mullowney 1920abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1921abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1922aa372e3fSPaul Mullowney 1923aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 19249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1925aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1926aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1927afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1928afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1929afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1930afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 19319566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1932aa372e3fSPaul Mullowney /* assign the pointer */ 1933aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1934aa372e3fSPaul Mullowney 1935afb2bd1cSJunchao Zhang if (mat) { 1936afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1937afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1938afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1939afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1940087f3262SPaul Mullowney } 1941afb2bd1cSJunchao Zhang #endif 1942087f3262SPaul Mullowney } 1943ca45077fSPaul Mullowney 1944aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1945213423ffSJunchao Zhang if (a->compressedrow.use) { 1946213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1947aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1948aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1949213423ffSJunchao Zhang tmp = m; 1950213423ffSJunchao Zhang } else { 1951213423ffSJunchao Zhang cusparsestruct->workVector = 
NULL; 1952213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1953213423ffSJunchao Zhang tmp = 0; 1954213423ffSJunchao Zhang } 19559566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 1956aa372e3fSPaul Mullowney 1957aa372e3fSPaul Mullowney /* assign the pointer */ 1958aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 19599ae82921SPaul Mullowney } catch(char *ex) { 196098921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 19619ae82921SPaul Mullowney } 19629566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 19639566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 196434d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 196534d6c7a5SJose E. Roman } 1966abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 19679ae82921SPaul Mullowney } 19689ae82921SPaul Mullowney PetscFunctionReturn(0); 19699ae82921SPaul Mullowney } 19709ae82921SPaul Mullowney 1971c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1972aa372e3fSPaul Mullowney { 1973aa372e3fSPaul Mullowney template <typename Tuple> 1974aa372e3fSPaul Mullowney __host__ __device__ 1975aa372e3fSPaul Mullowney void operator()(Tuple t) 1976aa372e3fSPaul Mullowney { 1977aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1978aa372e3fSPaul Mullowney } 1979aa372e3fSPaul Mullowney }; 1980aa372e3fSPaul Mullowney 19817e8381f9SStefano Zampini struct VecCUDAEquals 19827e8381f9SStefano Zampini { 19837e8381f9SStefano Zampini template <typename Tuple> 19847e8381f9SStefano Zampini __host__ __device__ 19857e8381f9SStefano Zampini void operator()(Tuple t) 19867e8381f9SStefano Zampini { 19877e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19887e8381f9SStefano Zampini } 19897e8381f9SStefano Zampini }; 19907e8381f9SStefano Zampini 1991e6e9a74fSStefano Zampini struct 
VecCUDAEqualsReverse 1992e6e9a74fSStefano Zampini { 1993e6e9a74fSStefano Zampini template <typename Tuple> 1994e6e9a74fSStefano Zampini __host__ __device__ 1995e6e9a74fSStefano Zampini void operator()(Tuple t) 1996e6e9a74fSStefano Zampini { 1997e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1998e6e9a74fSStefano Zampini } 1999e6e9a74fSStefano Zampini }; 2000e6e9a74fSStefano Zampini 2001afb2bd1cSJunchao Zhang struct MatMatCusparse { 2002ccdfe979SStefano Zampini PetscBool cisdense; 2003ccdfe979SStefano Zampini PetscScalar *Bt; 2004ccdfe979SStefano Zampini Mat X; 2005fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2006fcdce8c4SStefano Zampini PetscLogDouble flops; 2007fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2008b4285af6SJunchao Zhang 2009afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2010fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2011afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2012afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2013afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2014afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2015b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2016b4285af6SJunchao Zhang void *dBuffer4; 2017b4285af6SJunchao Zhang void *dBuffer5; 2018b4285af6SJunchao Zhang #endif 2019fcdce8c4SStefano Zampini size_t mmBufferSize; 2020fcdce8c4SStefano Zampini void *mmBuffer; 2021fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2022fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2023afb2bd1cSJunchao Zhang #endif 2024afb2bd1cSJunchao Zhang }; 2025ccdfe979SStefano Zampini 2026ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2027ccdfe979SStefano Zampini { 2028ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse 
*)data; 2029ccdfe979SStefano Zampini 2030ccdfe979SStefano Zampini PetscFunctionBegin; 20319566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2032fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2033afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 20349566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 20359566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 20369566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 20379566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2038b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 20399566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 20409566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2041b4285af6SJunchao Zhang #endif 20429566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 20439566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2044afb2bd1cSJunchao Zhang #endif 20459566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 20469566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2047ccdfe979SStefano Zampini PetscFunctionReturn(0); 2048ccdfe979SStefano Zampini } 2049ccdfe979SStefano Zampini 2050ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2051ccdfe979SStefano Zampini 2052ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2053ccdfe979SStefano Zampini { 2054ccdfe979SStefano Zampini Mat_Product *product = C->product; 2055ccdfe979SStefano Zampini Mat A,B; 2056afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 
2057ccdfe979SStefano Zampini PetscBool flg,biscuda; 2058ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2059ccdfe979SStefano Zampini cusparseStatus_t stat; 2060ccdfe979SStefano Zampini cusparseOperation_t opA; 2061ccdfe979SStefano Zampini const PetscScalar *barray; 2062ccdfe979SStefano Zampini PetscScalar *carray; 2063ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2064ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2065ccdfe979SStefano Zampini CsrMatrix *csrmat; 2066ccdfe979SStefano Zampini 2067ccdfe979SStefano Zampini PetscFunctionBegin; 2068ccdfe979SStefano Zampini MatCheckProduct(C,1); 206928b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2070ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2071ccdfe979SStefano Zampini A = product->A; 2072ccdfe979SStefano Zampini B = product->B; 20739566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 207428b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2075ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2076ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 207728b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 20789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2079ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2080ccdfe979SStefano Zampini switch (product->type) { 2081ccdfe979SStefano Zampini case MATPRODUCT_AB: 2082ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2083ccdfe979SStefano Zampini mat = cusp->mat; 2084ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 
2085ccdfe979SStefano Zampini m = A->rmap->n; 2086ccdfe979SStefano Zampini n = B->cmap->n; 2087ccdfe979SStefano Zampini break; 2088ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20891a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2090e6e9a74fSStefano Zampini mat = cusp->mat; 2091e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2092e6e9a74fSStefano Zampini } else { 20939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2094ccdfe979SStefano Zampini mat = cusp->matTranspose; 2095ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2096e6e9a74fSStefano Zampini } 2097ccdfe979SStefano Zampini m = A->cmap->n; 2098ccdfe979SStefano Zampini n = B->cmap->n; 2099ccdfe979SStefano Zampini break; 2100ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2101ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2102ccdfe979SStefano Zampini mat = cusp->mat; 2103ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2104ccdfe979SStefano Zampini m = A->rmap->n; 2105ccdfe979SStefano Zampini n = B->rmap->n; 2106ccdfe979SStefano Zampini break; 2107ccdfe979SStefano Zampini default: 210898921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2109ccdfe979SStefano Zampini } 211028b400f6SJacob Faibussowitsch PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2111ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2112ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 21139566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 21149566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 21159566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2116afb2bd1cSJunchao Zhang 21179566063dSJacob Faibussowitsch 
PetscCall(MatDenseGetLDA(B,&blda)); 2118c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 21199566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 21209566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2121c8378d12SStefano Zampini } else { 21229566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 21239566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C,&clda)); 2124c8378d12SStefano Zampini } 2125c8378d12SStefano Zampini 21269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2127afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2128afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2129a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2130afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2131fcdce8c4SStefano Zampini size_t mmBufferSize; 21329566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2133afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 21349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2135afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2136afb2bd1cSJunchao Zhang } 2137c8378d12SStefano Zampini 21389566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2139afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 21409566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2141afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2142afb2bd1cSJunchao Zhang } 2143afb2bd1cSJunchao Zhang 2144afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2145afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2146afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2147afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2148afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2149afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 21509566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2151afb2bd1cSJunchao Zhang } 2152afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2153afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2154afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 21559566063dSJacob Faibussowitsch cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2156fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 21579566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 21589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2159fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2160fcdce8c4SStefano Zampini } 2161afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2162afb2bd1cSJunchao Zhang } else { 2163afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 21649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 21659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 21669566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2167afb2bd1cSJunchao Zhang } 2168afb2bd1cSJunchao Zhang 2169afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2170afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2171afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2172afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 21739566063dSJacob Faibussowitsch cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2174afb2bd1cSJunchao Zhang #else 2175afb2bd1cSJunchao Zhang PetscInt k; 2176afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2177ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2178ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2179ccdfe979SStefano Zampini cublasStatus_t cerr; 2180ccdfe979SStefano Zampini 21819566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2182ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2183ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2184ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2185ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 21869566063dSJacob Faibussowitsch mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2187ccdfe979SStefano Zampini blda = B->cmap->n; 2188afb2bd1cSJunchao Zhang k = B->cmap->n; 2189afb2bd1cSJunchao Zhang } else { 2190afb2bd1cSJunchao Zhang k = B->rmap->n; 2191ccdfe979SStefano Zampini } 2192ccdfe979SStefano Zampini 2193afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2194ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2195afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2196ccdfe979SStefano Zampini csrmat->values->data().get(), 2197ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2198ccdfe979SStefano Zampini 
csrmat->column_indices->data().get(), 2199ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 22009566063dSJacob Faibussowitsch carray,clda);PetscCallCUSPARSE(stat); 2201afb2bd1cSJunchao Zhang #endif 22029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 22039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 22049566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2205ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 22069566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 22079566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2208ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 22099566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 22109566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2211ccdfe979SStefano Zampini } else { 22129566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2213ccdfe979SStefano Zampini } 2214ccdfe979SStefano Zampini if (mmdata->cisdense) { 22159566063dSJacob Faibussowitsch PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2216ccdfe979SStefano Zampini } 2217ccdfe979SStefano Zampini if (!biscuda) { 22189566063dSJacob Faibussowitsch PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2219ccdfe979SStefano Zampini } 2220ccdfe979SStefano Zampini PetscFunctionReturn(0); 2221ccdfe979SStefano Zampini } 2222ccdfe979SStefano Zampini 2223ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2224ccdfe979SStefano Zampini { 2225ccdfe979SStefano Zampini Mat_Product *product = C->product; 2226ccdfe979SStefano Zampini Mat A,B; 2227ccdfe979SStefano Zampini PetscInt m,n; 2228ccdfe979SStefano 
Zampini PetscBool cisdense,flg; 2229ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2230ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2231ccdfe979SStefano Zampini 2232ccdfe979SStefano Zampini PetscFunctionBegin; 2233ccdfe979SStefano Zampini MatCheckProduct(C,1); 223428b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2235ccdfe979SStefano Zampini A = product->A; 2236ccdfe979SStefano Zampini B = product->B; 22379566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 223828b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2239ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 224008401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2241ccdfe979SStefano Zampini switch (product->type) { 2242ccdfe979SStefano Zampini case MATPRODUCT_AB: 2243ccdfe979SStefano Zampini m = A->rmap->n; 2244ccdfe979SStefano Zampini n = B->cmap->n; 2245ccdfe979SStefano Zampini break; 2246ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2247ccdfe979SStefano Zampini m = A->cmap->n; 2248ccdfe979SStefano Zampini n = B->cmap->n; 2249ccdfe979SStefano Zampini break; 2250ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2251ccdfe979SStefano Zampini m = A->rmap->n; 2252ccdfe979SStefano Zampini n = B->rmap->n; 2253ccdfe979SStefano Zampini break; 2254ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2255ccdfe979SStefano Zampini m = B->cmap->n; 2256ccdfe979SStefano Zampini n = B->cmap->n; 2257ccdfe979SStefano Zampini break; 2258ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2259ccdfe979SStefano Zampini m = B->rmap->n; 2260ccdfe979SStefano Zampini n = B->rmap->n; 2261ccdfe979SStefano Zampini break; 2262ccdfe979SStefano Zampini default: 226398921bdaSJacob Faibussowitsch 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2264ccdfe979SStefano Zampini } 22659566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 2266ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 22679566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 22689566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQDENSECUDA)); 2269ccdfe979SStefano Zampini 2270ccdfe979SStefano Zampini /* product data */ 22719566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2272ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2273afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2274afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2275ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 22769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 2277ccdfe979SStefano Zampini } 2278afb2bd1cSJunchao Zhang #endif 2279ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2280ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 22819566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 22829566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 2283ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 22849566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 2285ccdfe979SStefano Zampini } else { 22869566063dSJacob Faibussowitsch 
PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 2287ccdfe979SStefano Zampini } 2288ccdfe979SStefano Zampini } 2289ccdfe979SStefano Zampini C->product->data = mmdata; 2290ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2291ccdfe979SStefano Zampini 2292ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2293ccdfe979SStefano Zampini PetscFunctionReturn(0); 2294ccdfe979SStefano Zampini } 2295ccdfe979SStefano Zampini 2296fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2297ccdfe979SStefano Zampini { 2298ccdfe979SStefano Zampini Mat_Product *product = C->product; 2299fcdce8c4SStefano Zampini Mat A,B; 2300fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2301fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2302fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2303fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2304fcdce8c4SStefano Zampini PetscBool flg; 2305fcdce8c4SStefano Zampini cusparseStatus_t stat; 2306fcdce8c4SStefano Zampini MatProductType ptype; 2307fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2308fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2309fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2310fcdce8c4SStefano Zampini #endif 2311b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2312ccdfe979SStefano Zampini 2313ccdfe979SStefano Zampini PetscFunctionBegin; 2314ccdfe979SStefano Zampini MatCheckProduct(C,1); 231528b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 23169566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 231728b400f6SJacob Faibussowitsch 
PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2318fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2319fcdce8c4SStefano Zampini A = product->A; 2320fcdce8c4SStefano Zampini B = product->B; 2321fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2322fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2323fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 232408401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2325fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 232628b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2327fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 232828b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2329fcdce8c4SStefano Zampini goto finalize; 2330fcdce8c4SStefano Zampini } 2331fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 23329566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 233328b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 23349566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 233528b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 233628b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 
233728b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2338fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2339fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2340fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 234108401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 234208401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 234308401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 23449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 23459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2346fcdce8c4SStefano Zampini 2347fcdce8c4SStefano Zampini ptype = product->type; 2348fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2349fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 235028b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2351fa046f9fSJunchao Zhang } 2352fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2353fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 235428b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2355fa046f9fSJunchao Zhang } 2356fcdce8c4SStefano Zampini switch (ptype) { 2357fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2358fcdce8c4SStefano Zampini Amat = Acusp->mat; 
2359fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2360fcdce8c4SStefano Zampini break; 2361fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2362fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2363fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2364fcdce8c4SStefano Zampini break; 2365fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2366fcdce8c4SStefano Zampini Amat = Acusp->mat; 2367fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2368fcdce8c4SStefano Zampini break; 2369fcdce8c4SStefano Zampini default: 237098921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2371fcdce8c4SStefano Zampini } 2372fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 237328b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 237428b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 237528b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2376fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2377fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2378fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 237928b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 238028b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 238128b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 23829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2383fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2384fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 23859566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2386b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2387b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2388b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2389b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 23909566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2391b4285af6SJunchao Zhang #else 2392b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2393fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2394fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 23959566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2396b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2397fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 23989566063dSJacob Faibussowitsch cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2399b4285af6SJunchao Zhang #endif 2400fcdce8c4SStefano Zampini #else 2401b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2402fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2403fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2404fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 24059566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2406fcdce8c4SStefano Zampini #endif 24079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 24089566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 24099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2410fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2411fcdce8c4SStefano Zampini finalize: 2412fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 24139566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 24149566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 24159566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 2416fcdce8c4SStefano Zampini c->reallocs = 0; 2417fcdce8c4SStefano Zampini C->info.mallocs += 0; 2418fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2419fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2420fcdce8c4SStefano Zampini C->num_ass++; 2421ccdfe979SStefano Zampini PetscFunctionReturn(0); 2422ccdfe979SStefano Zampini } 2423fcdce8c4SStefano Zampini 
2424fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2425fcdce8c4SStefano Zampini { 2426fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2427fcdce8c4SStefano Zampini Mat A,B; 2428fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2429fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2430fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2431fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2432fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2433fcdce8c4SStefano Zampini PetscBool flg; 2434fcdce8c4SStefano Zampini cusparseStatus_t stat; 2435fcdce8c4SStefano Zampini MatProductType ptype; 2436fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2437fcdce8c4SStefano Zampini PetscLogDouble flops; 2438fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2439fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2440fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2441fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2442fcdce8c4SStefano Zampini #else 2443fcdce8c4SStefano Zampini int cnz; 2444fcdce8c4SStefano Zampini #endif 2445b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2446fcdce8c4SStefano Zampini 2447fcdce8c4SStefano Zampini PetscFunctionBegin; 2448fcdce8c4SStefano Zampini MatCheckProduct(C,1); 244928b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2450fcdce8c4SStefano Zampini A = product->A; 2451fcdce8c4SStefano Zampini B = product->B; 24529566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 245328b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 24549566063dSJacob Faibussowitsch 
PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 245528b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2456fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2457fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2458fcdce8c4SStefano Zampini /* product data */ 24599566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2460fcdce8c4SStefano Zampini C->product->data = mmdata; 2461fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2462fcdce8c4SStefano Zampini 24639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 24649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2465d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2466d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 246708401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 246808401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2469d60bce21SJunchao Zhang 2470fcdce8c4SStefano Zampini ptype = product->type; 2471fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2472fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2473fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2474fa046f9fSJunchao Zhang } 2475fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2476fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2477fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2478fa046f9fSJunchao Zhang } 2479fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2480fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2481fcdce8c4SStefano Zampini switch (ptype) { 
2482fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2483fcdce8c4SStefano Zampini m = A->rmap->n; 2484fcdce8c4SStefano Zampini n = B->cmap->n; 2485fcdce8c4SStefano Zampini k = A->cmap->n; 2486fcdce8c4SStefano Zampini Amat = Acusp->mat; 2487fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2488fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2489fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2490fcdce8c4SStefano Zampini break; 2491fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2492fcdce8c4SStefano Zampini m = A->cmap->n; 2493fcdce8c4SStefano Zampini n = B->cmap->n; 2494fcdce8c4SStefano Zampini k = A->rmap->n; 24959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2496fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2497fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2498fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2499fcdce8c4SStefano Zampini break; 2500fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2501fcdce8c4SStefano Zampini m = A->rmap->n; 2502fcdce8c4SStefano Zampini n = B->rmap->n; 2503fcdce8c4SStefano Zampini k = A->cmap->n; 25049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2505fcdce8c4SStefano Zampini Amat = Acusp->mat; 2506fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2507fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2508fcdce8c4SStefano Zampini break; 2509fcdce8c4SStefano Zampini default: 251098921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2511fcdce8c4SStefano Zampini } 2512fcdce8c4SStefano Zampini 2513fcdce8c4SStefano Zampini /* create cusparse matrix */ 25149566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 25159566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2516fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 
2517fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2518fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2519fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2520fcdce8c4SStefano Zampini 2521fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2522fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2523fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 25249566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 25259566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2526fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2527fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2528fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2529fcdce8c4SStefano Zampini } else { 2530fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2531fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2532fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2533fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2534fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2535fcdce8c4SStefano Zampini } 2536fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2537fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2538fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2539fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2540fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2541fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 25429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 25439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 25449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 25459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 25469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 25479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 25489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2551fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2552fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2553fcdce8c4SStefano Zampini c->nz = 0; 2554fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2555fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2556fcdce8c4SStefano Zampini goto finalizesym; 2557fcdce8c4SStefano Zampini } 2558fcdce8c4SStefano Zampini 255928b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 256028b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2561fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2562fcdce8c4SStefano Zampini if (!biscompressed) { 2563fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2564fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2565fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2566fcdce8c4SStefano Zampini #endif 2567fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2568fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2569fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2570fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2571fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2572fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2573fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2574fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2575fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2576fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2577fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 25789566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2579fcdce8c4SStefano Zampini } 2580fcdce8c4SStefano 
Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2581fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2582fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2583fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2584fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2585fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2586fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2587fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 25889566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2589fcdce8c4SStefano Zampini } 2590fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2591fcdce8c4SStefano Zampini #endif 2592fcdce8c4SStefano Zampini } 259328b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 259428b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2595fcdce8c4SStefano Zampini /* precompute flops count */ 2596fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2597fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2598fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2599fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2600fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2601fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2602fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2603fcdce8c4SStefano Zampini } 2604fcdce8c4SStefano Zampini } 2605fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2606fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2607fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2608fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2609fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 
2610fcdce8c4SStefano Zampini } 2611fcdce8c4SStefano Zampini } else { /* TODO */ 2612fcdce8c4SStefano Zampini flops = 0.; 2613fcdce8c4SStefano Zampini } 2614fcdce8c4SStefano Zampini 2615fcdce8c4SStefano Zampini mmdata->flops = flops; 26169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2617b4285af6SJunchao Zhang 2618fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 26199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2620fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2621fcdce8c4SStefano Zampini NULL, NULL, NULL, 2622fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 26239566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 26249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2625b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2626b4285af6SJunchao Zhang { 2627b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2628b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2629b4285af6SJunchao Zhang */ 2630b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2631b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2632b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2633b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2634b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2635b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2636b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2637b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2638b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2639b4285af6SJunchao Zhang 2640b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2641b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2642b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2643b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26449566063dSJacob Faibussowitsch &bufferSize1, NULL);PetscCallCUSPARSE(stat); 26459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2646b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2647b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2648b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26499566063dSJacob Faibussowitsch &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2650b4285af6SJunchao Zhang 2651b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2652b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2653b4285af6SJunchao Zhang 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26549566063dSJacob Faibussowitsch &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 26559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 26569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 26579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2658b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2659b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26609566063dSJacob Faibussowitsch &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 26619566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 26629566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 2663b4285af6SJunchao Zhang 2664b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2665b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 26669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2667b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2668b4285af6SJunchao Zhang /* allocate matrix C */ 26699566063dSJacob Faibussowitsch Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 26709566063dSJacob Faibussowitsch Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2671b4285af6SJunchao Zhang /* update matC with the new pointers */ 2672b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 26739566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2674b4285af6SJunchao Zhang 
2675b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2676b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2677b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26789566063dSJacob Faibussowitsch &bufferSize5, NULL);PetscCallCUSPARSE(stat); 26799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2680b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2681b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26829566063dSJacob Faibussowitsch &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 26839566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 2684b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2685b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2686b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26879566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 26889566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 2689b4285af6SJunchao Zhang } 2690ae37ee31SJunchao Zhang #else 2691b4285af6SJunchao Zhang size_t bufSize2; 2692fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2693b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2694fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2695fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26969566063dSJacob 
Faibussowitsch mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 26979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2698fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2699b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2700fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2701fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27029566063dSJacob Faibussowitsch mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2703fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2704b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2705fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2706fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27079566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2708fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2709fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2710fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2711fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2712fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 27139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2714fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2715b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2716fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2717fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27189566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2719fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 27209566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2721fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 27229566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2723fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 27249566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2725fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 27269566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2727fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 27289566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2729b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2730fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 27319566063dSJacob Faibussowitsch cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2732ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2733fcdce8c4SStefano Zampini #else 27349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2735b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2736fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2737fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2738fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 27399566063dSJacob Faibussowitsch Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2740fcdce8c4SStefano Zampini c->nz = cnz; 2741fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 27429566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2743fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 27449566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2745fcdce8c4SStefano Zampini 27469566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2747fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2748fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2749fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2750b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2751fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2752fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2753fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 27549566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2755fcdce8c4SStefano Zampini #endif 27569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 27579566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2758fcdce8c4SStefano Zampini finalizesym: 2759fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2760fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2761fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 27629566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 27639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 2764fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2765fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2766fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2767fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2768fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2769fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2770fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27729566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2773fcdce8c4SStefano Zampini } else { 2774fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2775fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2778fcdce8c4SStefano Zampini } 2779fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2780fcdce8c4SStefano Zampini PetscInt r = 0; 2781fcdce8c4SStefano Zampini c->i[0] = 0; 2782fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2783fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2784fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2785fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2786fcdce8c4SStefano Zampini } 2787fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2788fcdce8c4SStefano Zampini } 27899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 27909566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 27919566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 2792fcdce8c4SStefano Zampini c->maxnz = c->nz; 2793fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2794fcdce8c4SStefano Zampini c->rmax = 0; 2795fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2796fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2797fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2798fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 2799fcdce8c4SStefano Zampini c->rmax = 
PetscMax(c->rmax,nn); 2800fcdce8c4SStefano Zampini } 28019566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 28029566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 2803fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2804fcdce8c4SStefano Zampini 2805fcdce8c4SStefano Zampini C->nonzerostate++; 28069566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 28079566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 2808fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2809fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2810fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2811fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2812fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2813abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2814fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2815fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2816fcdce8c4SStefano Zampini } 2817fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2818fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2819fcdce8c4SStefano Zampini } 2820fcdce8c4SStefano Zampini 2821fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2822fcdce8c4SStefano Zampini 2823fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2824fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2825fcdce8c4SStefano Zampini { 2826fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2827fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2828fcdce8c4SStefano Zampini 2829fcdce8c4SStefano Zampini PetscFunctionBegin; 2830fcdce8c4SStefano Zampini 
MatCheckProduct(mat,1); 28319566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 2832abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 28339566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 2834fcdce8c4SStefano Zampini } 2835fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2836fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2837fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 28389566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 2839fcdce8c4SStefano Zampini } 2840fcdce8c4SStefano Zampini } 284165e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 284265e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 284365e4b4d4SStefano Zampini switch (product->type) { 284465e4b4d4SStefano Zampini case MATPRODUCT_AB: 284565e4b4d4SStefano Zampini if (product->api_user) { 2846*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 28479566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 2848*d0609cedSBarry Smith PetscOptionsEnd(); 284965e4b4d4SStefano Zampini } else { 2850*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 28519566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 2852*d0609cedSBarry Smith PetscOptionsEnd(); 285365e4b4d4SStefano Zampini } 285465e4b4d4SStefano Zampini break; 285565e4b4d4SStefano Zampini case MATPRODUCT_AtB: 285665e4b4d4SStefano Zampini if (product->api_user) { 2857*d0609cedSBarry Smith 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 28589566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2859*d0609cedSBarry Smith PetscOptionsEnd(); 286065e4b4d4SStefano Zampini } else { 2861*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 28629566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2863*d0609cedSBarry Smith PetscOptionsEnd(); 286465e4b4d4SStefano Zampini } 286565e4b4d4SStefano Zampini break; 286665e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 286765e4b4d4SStefano Zampini if (product->api_user) { 2868*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 28699566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2870*d0609cedSBarry Smith PetscOptionsEnd(); 287165e4b4d4SStefano Zampini } else { 2872*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 28739566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2874*d0609cedSBarry Smith PetscOptionsEnd(); 287565e4b4d4SStefano Zampini } 287665e4b4d4SStefano Zampini break; 287765e4b4d4SStefano Zampini case MATPRODUCT_RARt: 287865e4b4d4SStefano Zampini if (product->api_user) { 2879*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat"); 28809566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2881*d0609cedSBarry Smith PetscOptionsEnd(); 
288265e4b4d4SStefano Zampini } else { 2883*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat"); 28849566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2885*d0609cedSBarry Smith PetscOptionsEnd(); 288665e4b4d4SStefano Zampini } 288765e4b4d4SStefano Zampini break; 288865e4b4d4SStefano Zampini case MATPRODUCT_ABC: 288965e4b4d4SStefano Zampini if (product->api_user) { 2890*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat"); 28919566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2892*d0609cedSBarry Smith PetscOptionsEnd(); 289365e4b4d4SStefano Zampini } else { 2894*d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat"); 28959566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2896*d0609cedSBarry Smith PetscOptionsEnd(); 289765e4b4d4SStefano Zampini } 289865e4b4d4SStefano Zampini break; 289965e4b4d4SStefano Zampini default: 290065e4b4d4SStefano Zampini break; 290165e4b4d4SStefano Zampini } 290265e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 290365e4b4d4SStefano Zampini } 290465e4b4d4SStefano Zampini /* dispatch */ 2905fcdce8c4SStefano Zampini if (isdense) { 2906ccdfe979SStefano Zampini switch (product->type) { 2907ccdfe979SStefano Zampini case MATPRODUCT_AB: 2908ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2909ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2910ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2911ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2912fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 29139566063dSJacob Faibussowitsch 
PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2914fcdce8c4SStefano Zampini } else { 2915fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2916fcdce8c4SStefano Zampini } 2917fcdce8c4SStefano Zampini break; 2918fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2919fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2920fcdce8c4SStefano Zampini break; 2921ccdfe979SStefano Zampini default: 2922ccdfe979SStefano Zampini break; 2923ccdfe979SStefano Zampini } 2924fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2925fcdce8c4SStefano Zampini switch (product->type) { 2926fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2927fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2928fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2929fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2930fcdce8c4SStefano Zampini break; 2931fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2932fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2933fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2934fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2935fcdce8c4SStefano Zampini break; 2936fcdce8c4SStefano Zampini default: 2937fcdce8c4SStefano Zampini break; 2938fcdce8c4SStefano Zampini } 2939fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 29409566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 2941fcdce8c4SStefano Zampini } 2942ccdfe979SStefano Zampini PetscFunctionReturn(0); 2943ccdfe979SStefano Zampini } 2944ccdfe979SStefano Zampini 29456fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 29469ae82921SPaul Mullowney { 29479ae82921SPaul Mullowney PetscFunctionBegin; 29489566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 2949e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2950e6e9a74fSStefano Zampini } 
2951e6e9a74fSStefano Zampini 2952e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2953e6e9a74fSStefano Zampini { 2954e6e9a74fSStefano Zampini PetscFunctionBegin; 29559566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 2956e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2957e6e9a74fSStefano Zampini } 2958e6e9a74fSStefano Zampini 2959e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2960e6e9a74fSStefano Zampini { 2961e6e9a74fSStefano Zampini PetscFunctionBegin; 29629566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 2963e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2964e6e9a74fSStefano Zampini } 2965e6e9a74fSStefano Zampini 2966e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2967e6e9a74fSStefano Zampini { 2968e6e9a74fSStefano Zampini PetscFunctionBegin; 29699566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 29709ae82921SPaul Mullowney PetscFunctionReturn(0); 29719ae82921SPaul Mullowney } 29729ae82921SPaul Mullowney 29736fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2974ca45077fSPaul Mullowney { 2975ca45077fSPaul Mullowney PetscFunctionBegin; 29769566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 2977ca45077fSPaul Mullowney PetscFunctionReturn(0); 2978ca45077fSPaul Mullowney } 2979ca45077fSPaul Mullowney 2980a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2981a0e72f99SJunchao Zhang { 2982a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2983a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2984a0e72f99SJunchao Zhang } 
2985a0e72f99SJunchao Zhang 2986afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2987e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 29889ae82921SPaul Mullowney { 29899ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2990aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 29919ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2992e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2993e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2994e6e9a74fSStefano Zampini PetscBool compressed; 2995afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2996afb2bd1cSJunchao Zhang PetscInt nx,ny; 2997afb2bd1cSJunchao Zhang #endif 29986e111a19SKarl Rupp 29999ae82921SPaul Mullowney PetscFunctionBegin; 300008401ef6SPierre Jolivet PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3001cbc6b225SStefano Zampini if (!a->nz) { 30029566063dSJacob Faibussowitsch if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 30039566063dSJacob Faibussowitsch else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3004e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3005e6e9a74fSStefano Zampini } 300634d6c7a5SJose E. 
Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 30079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3008e6e9a74fSStefano Zampini if (!trans) { 30099ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 30105f80ce2aSJacob Faibussowitsch PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3011e6e9a74fSStefano Zampini } else { 30121a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3013e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3014e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3015e6e9a74fSStefano Zampini } else { 30169566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3017e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3018e6e9a74fSStefano Zampini } 3019e6e9a74fSStefano Zampini } 3020e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3021e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3022213423ffSJunchao Zhang 3023e6e9a74fSStefano Zampini try { 30249566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 30259566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 30269566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3027afb2bd1cSJunchao Zhang 30289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3029e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3030afb2bd1cSJunchao Zhang /* z = A x + beta y. 
3031afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3032afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3033afb2bd1cSJunchao Zhang */ 3034e6e9a74fSStefano Zampini xptr = xarray; 3035afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3036213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3037afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3038afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3039afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3040afb2bd1cSJunchao Zhang */ 3041afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3042afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3043afb2bd1cSJunchao Zhang nx = mat->num_cols; 3044afb2bd1cSJunchao Zhang ny = mat->num_rows; 3045afb2bd1cSJunchao Zhang } 3046afb2bd1cSJunchao Zhang #endif 3047e6e9a74fSStefano Zampini } else { 3048afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3049afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3050afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3051afb2bd1cSJunchao Zhang */ 3052afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3053e6e9a74fSStefano Zampini dptr = zarray; 3054e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3055afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3056e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3057a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3058e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3059e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3060e6e9a74fSStefano Zampini } 3061afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3062afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3063afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3064afb2bd1cSJunchao Zhang nx = mat->num_rows; 3065afb2bd1cSJunchao Zhang ny = mat->num_cols; 3066afb2bd1cSJunchao Zhang } 3067afb2bd1cSJunchao Zhang #endif 3068e6e9a74fSStefano Zampini } 30699ae82921SPaul Mullowney 3070afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3071aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3072afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 30735f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3074afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 30759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 30769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 
30779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3078afb2bd1cSJunchao Zhang matstruct->matDescr, 3079afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3080afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3081afb2bd1cSJunchao Zhang cusparse_scalartype, 3082afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 30835f80ce2aSJacob Faibussowitsch &matstruct->cuSpMV[opA].spmvBufferSize)); 30849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3085afb2bd1cSJunchao Zhang 3086afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3087afb2bd1cSJunchao Zhang } else { 3088afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 30899566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 30909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3091afb2bd1cSJunchao Zhang } 3092afb2bd1cSJunchao Zhang 30939566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3094afb2bd1cSJunchao Zhang matstruct->alpha_one, 30953606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3096afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3097afb2bd1cSJunchao Zhang beta, 3098afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3099afb2bd1cSJunchao Zhang cusparse_scalartype, 3100afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 31015f80ce2aSJacob Faibussowitsch matstruct->cuSpMV[opA].spmvBuffer)); 3102afb2bd1cSJunchao Zhang #else 31037656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 31049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 
3105a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3106afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3107aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3108e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 31095f80ce2aSJacob Faibussowitsch dptr)); 3110afb2bd1cSJunchao Zhang #endif 3111aa372e3fSPaul Mullowney } else { 3112213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3113afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3114afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3115afb2bd1cSJunchao Zhang #else 3116301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 31179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3118afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3119e6e9a74fSStefano Zampini xptr, beta, 31205f80ce2aSJacob Faibussowitsch dptr)); 3121afb2bd1cSJunchao Zhang #endif 3122a65300a6SPaul Mullowney } 3123aa372e3fSPaul Mullowney } 31249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3125aa372e3fSPaul Mullowney 3126e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3127213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3128213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 31299566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3130e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 31319566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 31327656d835SStefano Zampini } 3133213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 31349566063dSJacob Faibussowitsch PetscCall(VecSet_SeqCUDA(zz,0)); 31357656d835SStefano Zampini } 31367656d835SStefano Zampini 3137213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3138213423ffSJunchao Zhang if (compressed) { 31399566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3140a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3141a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3142a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3143a0e72f99SJunchao Zhang */ 3144a0e72f99SJunchao Zhang #if 0 3145a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3146a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3147a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3148e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3149c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3150a0e72f99SJunchao Zhang #else 3151a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3152a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3153a0e72f99SJunchao Zhang #endif 31549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3155e6e9a74fSStefano Zampini } 3156e6e9a74fSStefano Zampini } else { 
3157e6e9a74fSStefano Zampini if (yy && yy != zz) { 31589566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3159e6e9a74fSStefano Zampini } 3160e6e9a74fSStefano Zampini } 31619566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 31629566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 31639566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 31649ae82921SPaul Mullowney } catch(char *ex) { 316598921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 31669ae82921SPaul Mullowney } 3167e6e9a74fSStefano Zampini if (yy) { 31689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3169e6e9a74fSStefano Zampini } else { 31709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3171e6e9a74fSStefano Zampini } 31729ae82921SPaul Mullowney PetscFunctionReturn(0); 31739ae82921SPaul Mullowney } 31749ae82921SPaul Mullowney 31756fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3176ca45077fSPaul Mullowney { 3177ca45077fSPaul Mullowney PetscFunctionBegin; 31789566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3179ca45077fSPaul Mullowney PetscFunctionReturn(0); 3180ca45077fSPaul Mullowney } 3181ca45077fSPaul Mullowney 31826fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 31839ae82921SPaul Mullowney { 3184042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3185042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 31863fa6b06aSMark Adams 3187042217e8SBarry Smith PetscFunctionBegin; 31889566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3189042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3190042217e8SBarry Smith 31919566063dSJacob 
Faibussowitsch PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 31929566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->deviceMat)); 3193042217e8SBarry Smith cusp->deviceMat = NULL; 3194042217e8SBarry Smith } 31959ae82921SPaul Mullowney PetscFunctionReturn(0); 31969ae82921SPaul Mullowney } 31979ae82921SPaul Mullowney 31989ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3199e057df02SPaul Mullowney /*@ 32009ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3201e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3202e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3203e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3204e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3205e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 32069ae82921SPaul Mullowney 3207d083f849SBarry Smith Collective 32089ae82921SPaul Mullowney 32099ae82921SPaul Mullowney Input Parameters: 32109ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 32119ae82921SPaul Mullowney . m - number of rows 32129ae82921SPaul Mullowney . n - number of columns 32139ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 32149ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 32150298fd71SBarry Smith (possibly different for each row) or NULL 32169ae82921SPaul Mullowney 32179ae82921SPaul Mullowney Output Parameter: 32189ae82921SPaul Mullowney . 
A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
32419ae82921SPaul Mullowney 32429ae82921SPaul Mullowney Level: intermediate 32439ae82921SPaul Mullowney 3244e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 32459ae82921SPaul Mullowney @*/ 32469ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 32479ae82921SPaul Mullowney { 32489ae82921SPaul Mullowney PetscFunctionBegin; 32499566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 32509566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 32519566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 32529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 32539ae82921SPaul Mullowney PetscFunctionReturn(0); 32549ae82921SPaul Mullowney } 32559ae82921SPaul Mullowney 32566fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 32579ae82921SPaul Mullowney { 32589ae82921SPaul Mullowney PetscFunctionBegin; 32599ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 32609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 32619ae82921SPaul Mullowney } else { 32629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 3263aa372e3fSPaul Mullowney } 32649566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 32659566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 32669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 32679566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 32689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 32699566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 32709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 32719566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 32729566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 32739566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 32749566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 32759ae82921SPaul Mullowney PetscFunctionReturn(0); 32769ae82921SPaul Mullowney } 32779ae82921SPaul Mullowney 3278ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 327995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 32809ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 32819ff858a8SKarl Rupp { 32829ff858a8SKarl Rupp PetscFunctionBegin; 32839566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 32849566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 32859ff858a8SKarl Rupp PetscFunctionReturn(0); 32869ff858a8SKarl Rupp } 32879ff858a8SKarl Rupp 3288039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 328995639643SRichard Tran Mills { 3290a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = 
(Mat_SeqAIJ*)Y->data; 3291039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3292039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3293039c6fbaSStefano Zampini PetscScalar *ay; 3294039c6fbaSStefano Zampini const PetscScalar *ax; 3295039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3296e6e9a74fSStefano Zampini 329795639643SRichard Tran Mills PetscFunctionBegin; 3298a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3299a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3300039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 33019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33029566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3303a587d139SMark PetscFunctionReturn(0); 330495639643SRichard Tran Mills } 3305039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 33069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 33079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 33085f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 33095f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3310039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3311039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3312039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3313039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3314039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3315039c6fbaSStefano Zampini if (eq) { 3316039c6fbaSStefano Zampini eq = 
thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3317039c6fbaSStefano Zampini } 3318039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3319039c6fbaSStefano Zampini } 3320d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3321d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3322039c6fbaSStefano Zampini 3323039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3324039c6fbaSStefano Zampini PetscScalar b = 1.0; 3325039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3326039c6fbaSStefano Zampini size_t bufferSize; 3327039c6fbaSStefano Zampini void *buffer; 3328039c6fbaSStefano Zampini #endif 3329039c6fbaSStefano Zampini 33309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3333039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3335039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3336039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33375f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 33389566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 33399566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3341039c6fbaSStefano Zampini 
&a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3342039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33435f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 33449566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 33459566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33469566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3347039c6fbaSStefano Zampini #else 33489566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3350039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3351039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33525f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 33539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 33549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3355039c6fbaSStefano Zampini #endif 33569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 33579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3360039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3361a587d139SMark cublasHandle_t cublasv2handle; 3362a587d139SMark PetscBLASInt one = 1, bnz = 1; 3363039c6fbaSStefano Zampini 33649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33659566063dSJacob 
Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33669566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 33679566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz,&bnz)); 33689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33699566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 33709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*bnz)); 33719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3375039c6fbaSStefano Zampini } else { 33769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33779566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3378a587d139SMark } 337995639643SRichard Tran Mills PetscFunctionReturn(0); 338095639643SRichard Tran Mills } 338195639643SRichard Tran Mills 338233c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 338333c9ba73SStefano Zampini { 338433c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 338533c9ba73SStefano Zampini PetscScalar *ay; 338633c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 338733c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 338833c9ba73SStefano Zampini 338933c9ba73SStefano Zampini PetscFunctionBegin; 33909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33919566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 33929566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz,&bnz)); 33939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33949566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 33959566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuFlops(bnz)); 33969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 339933c9ba73SStefano Zampini PetscFunctionReturn(0); 340033c9ba73SStefano Zampini } 340133c9ba73SStefano Zampini 34023fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 34033fa6b06aSMark Adams { 34047e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3405a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 34067e8381f9SStefano Zampini 34073fa6b06aSMark Adams PetscFunctionBegin; 34083fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 34093fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 34107e8381f9SStefano Zampini if (spptr->mat) { 34117e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 34127e8381f9SStefano Zampini if (matrix->values) { 34137e8381f9SStefano Zampini both = PETSC_TRUE; 34147e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34157e8381f9SStefano Zampini } 34167e8381f9SStefano Zampini } 34177e8381f9SStefano Zampini if (spptr->matTranspose) { 34187e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 34197e8381f9SStefano Zampini if (matrix->values) { 34207e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34217e8381f9SStefano Zampini } 34227e8381f9SStefano Zampini } 34233fa6b06aSMark Adams } 34249566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 34259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 34267e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3427a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 34283fa6b06aSMark Adams PetscFunctionReturn(0); 34293fa6b06aSMark Adams } 34303fa6b06aSMark Adams 
3431a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3432a587d139SMark { 3433a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3434a587d139SMark 3435a587d139SMark PetscFunctionBegin; 34369a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 34379a14fc28SStefano Zampini A->boundtocpu = flg; 34389a14fc28SStefano Zampini PetscFunctionReturn(0); 34399a14fc28SStefano Zampini } 3440a587d139SMark if (flg) { 34419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3442a587d139SMark 344333c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3444a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3445a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3446a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3447a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3448a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3449a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3450a587d139SMark A->ops->multhermitiantranspose = NULL; 3451a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3452fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 34539566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 34549566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 34559566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 34569566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 34579566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 34589566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 34599566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ)); 34609566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 3461a587d139SMark } else { 346233c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3463a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3464a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3465a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3466a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3467a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3468a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3469a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3470a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3471fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 347267a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 347367a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 347467a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 347567a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 347667a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 347767a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 34787ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 34797ee59b9bSJunchao Zhang 34809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 34819566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 34829566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 34839566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 34849566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 34859566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 3486a587d139SMark } 3487a587d139SMark A->boundtocpu = flg; 3488ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3489ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3490ea500dcfSRichard Tran Mills } else { 3491ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3492ea500dcfSRichard Tran Mills } 3493a587d139SMark PetscFunctionReturn(0); 3494a587d139SMark } 3495a587d139SMark 349649735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 34979ae82921SPaul Mullowney { 349849735bf3SStefano Zampini Mat B; 34999ae82921SPaul Mullowney 35009ae82921SPaul Mullowney PetscFunctionBegin; 35019566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 350249735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 35039566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 350449735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 35059566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 350649735bf3SStefano Zampini } 350749735bf3SStefano Zampini B = 
*newmat; 350849735bf3SStefano Zampini 35099566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 35109566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); 351134136279SStefano Zampini 351249735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 35139ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3514e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 35159566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 35169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 35179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 35181a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3519d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 35208efa179dSJose E. Roman #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 3521a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3522a435da06SStefano Zampini #else 3523d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3524a435da06SStefano Zampini #endif 3525d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3526d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3527d8132acaSStefano Zampini #endif 35281a2c6b5cSJunchao Zhang B->spptr = spptr; 35299ae82921SPaul Mullowney } else { 3530e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3531e6e9a74fSStefano Zampini 35329566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 35339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 35349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 3535e6e9a74fSStefano Zampini B->spptr = spptr; 35369ae82921SPaul Mullowney } 3537e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 
353849735bf3SStefano Zampini } 3539693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 35409ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 35411a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 35429ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 354395639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3544693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 35452205254eSKarl Rupp 35469566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 35479566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 35489566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3549ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 35509566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 3551ae48a8d0SStefano Zampini #endif 35529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 35539ae82921SPaul Mullowney PetscFunctionReturn(0); 35549ae82921SPaul Mullowney } 35559ae82921SPaul Mullowney 355602fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 355702fe1965SBarry Smith { 355802fe1965SBarry Smith PetscFunctionBegin; 35599566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 35609566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 356102fe1965SBarry Smith PetscFunctionReturn(0); 356202fe1965SBarry Smith } 356302fe1965SBarry Smith 35643ca39a21SBarry Smith /*MC 3565e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 
3566e057df02SPaul Mullowney 3567e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 35682692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 35692692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3570e057df02SPaul Mullowney 3571e057df02SPaul Mullowney Options Database Keys: 3572e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3573aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3574a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3575365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3576e057df02SPaul Mullowney 3577e057df02SPaul Mullowney Level: beginner 3578e057df02SPaul Mullowney 35798468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3580e057df02SPaul Mullowney M*/ 35817f756511SDominic Meiser 3582bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 35830f39cd5aSBarry Smith 35843ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 358542c9c57cSBarry Smith { 358642c9c57cSBarry Smith PetscFunctionBegin; 35879566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 35889566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 35899566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 35909566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 35919566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3592bddcd29dSMark Adams 359342c9c57cSBarry Smith PetscFunctionReturn(0); 359442c9c57cSBarry Smith } 359529b38603SBarry Smith 3596cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3597cbc6b225SStefano Zampini { 3598cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3599cbc6b225SStefano Zampini 3600cbc6b225SStefano Zampini PetscFunctionBegin; 3601cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 
3602cbc6b225SStefano Zampini delete cusp->cooPerm; 3603cbc6b225SStefano Zampini delete cusp->cooPerm_a; 3604cbc6b225SStefano Zampini cusp->cooPerm = NULL; 3605cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 3606cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 36079566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 36089566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 3609cbc6b225SStefano Zampini } 3610cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 3611cbc6b225SStefano Zampini PetscFunctionReturn(0); 3612cbc6b225SStefano Zampini } 3613cbc6b225SStefano Zampini 3614470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 36157f756511SDominic Meiser { 36167f756511SDominic Meiser PetscFunctionBegin; 36177f756511SDominic Meiser if (*cusparsestruct) { 36189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 36199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 36207f756511SDominic Meiser delete (*cusparsestruct)->workVector; 362181902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 36227e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 36237e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3624a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 36259566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 36269566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 36279566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 36289566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 36297f756511SDominic Meiser } 36307f756511SDominic Meiser PetscFunctionReturn(0); 
36317f756511SDominic Meiser } 36327f756511SDominic Meiser 36337f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 36347f756511SDominic Meiser { 36357f756511SDominic Meiser PetscFunctionBegin; 36367f756511SDominic Meiser if (*mat) { 36377f756511SDominic Meiser delete (*mat)->values; 36387f756511SDominic Meiser delete (*mat)->column_indices; 36397f756511SDominic Meiser delete (*mat)->row_offsets; 36407f756511SDominic Meiser delete *mat; 36417f756511SDominic Meiser *mat = 0; 36427f756511SDominic Meiser } 36437f756511SDominic Meiser PetscFunctionReturn(0); 36447f756511SDominic Meiser } 36457f756511SDominic Meiser 3646470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 36477f756511SDominic Meiser { 36487f756511SDominic Meiser PetscFunctionBegin; 36497f756511SDominic Meiser if (*trifactor) { 36509566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 36519566063dSJacob Faibussowitsch if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo)); 36529566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 36539566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 36549566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3655afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 36569566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3657afb2bd1cSJunchao Zhang #endif 36589566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 36597f756511SDominic Meiser } 36607f756511SDominic Meiser PetscFunctionReturn(0); 36617f756511SDominic Meiser } 36627f756511SDominic Meiser 3663470880abSPatrick Sanan static PetscErrorCode 
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 36647f756511SDominic Meiser { 36657f756511SDominic Meiser CsrMatrix *mat; 36667f756511SDominic Meiser 36677f756511SDominic Meiser PetscFunctionBegin; 36687f756511SDominic Meiser if (*matstruct) { 36697f756511SDominic Meiser if ((*matstruct)->mat) { 36707f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3671afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3672afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3673afb2bd1cSJunchao Zhang #else 36747f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 36759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3676afb2bd1cSJunchao Zhang #endif 36777f756511SDominic Meiser } else { 36787f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 36797f756511SDominic Meiser CsrMatrix_Destroy(&mat); 36807f756511SDominic Meiser } 36817f756511SDominic Meiser } 36829566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 36837f756511SDominic Meiser delete (*matstruct)->cprowIndices; 36849566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 36859566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 36869566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3687afb2bd1cSJunchao Zhang 3688afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3689afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 36909566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3691afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3692afb2bd1cSJunchao Zhang if 
(mdata->cuSpMV[i].initialized) { 36939566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 36949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 36959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3696afb2bd1cSJunchao Zhang } 3697afb2bd1cSJunchao Zhang } 3698afb2bd1cSJunchao Zhang #endif 36997f756511SDominic Meiser delete *matstruct; 37007e8381f9SStefano Zampini *matstruct = NULL; 37017f756511SDominic Meiser } 37027f756511SDominic Meiser PetscFunctionReturn(0); 37037f756511SDominic Meiser } 37047f756511SDominic Meiser 3705e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 37067f756511SDominic Meiser { 37077f756511SDominic Meiser PetscFunctionBegin; 37087f756511SDominic Meiser if (*trifactors) { 37099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 37109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 37119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 37129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 37137f756511SDominic Meiser delete (*trifactors)->rpermIndices; 37147f756511SDominic Meiser delete (*trifactors)->cpermIndices; 37157f756511SDominic Meiser delete (*trifactors)->workVector; 37167e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 37177e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 37187e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 37199566063dSJacob Faibussowitsch if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 37209566063dSJacob Faibussowitsch if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3721e8d2b73aSMark 
Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3722ccdfe979SStefano Zampini } 3723ccdfe979SStefano Zampini PetscFunctionReturn(0); 3724ccdfe979SStefano Zampini } 3725ccdfe979SStefano Zampini 3726ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3727ccdfe979SStefano Zampini { 3728ccdfe979SStefano Zampini cusparseHandle_t handle; 3729ccdfe979SStefano Zampini 3730ccdfe979SStefano Zampini PetscFunctionBegin; 3731ccdfe979SStefano Zampini if (*trifactors) { 37329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 37337f756511SDominic Meiser if (handle = (*trifactors)->handle) { 37349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroy(handle)); 37357f756511SDominic Meiser } 37369566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 37377f756511SDominic Meiser } 37387f756511SDominic Meiser PetscFunctionReturn(0); 37397f756511SDominic Meiser } 37407e8381f9SStefano Zampini 37417e8381f9SStefano Zampini struct IJCompare 37427e8381f9SStefano Zampini { 37437e8381f9SStefano Zampini __host__ __device__ 37447e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37457e8381f9SStefano Zampini { 37467e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 37477e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 37487e8381f9SStefano Zampini return false; 37497e8381f9SStefano Zampini } 37507e8381f9SStefano Zampini }; 37517e8381f9SStefano Zampini 37527e8381f9SStefano Zampini struct IJEqual 37537e8381f9SStefano Zampini { 37547e8381f9SStefano Zampini __host__ __device__ 37557e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37567e8381f9SStefano Zampini { 37577e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != 
t2.get<1>()) return false; 37587e8381f9SStefano Zampini return true; 37597e8381f9SStefano Zampini } 37607e8381f9SStefano Zampini }; 37617e8381f9SStefano Zampini 37627e8381f9SStefano Zampini struct IJDiff 37637e8381f9SStefano Zampini { 37647e8381f9SStefano Zampini __host__ __device__ 37657e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37667e8381f9SStefano Zampini { 37677e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 37687e8381f9SStefano Zampini } 37697e8381f9SStefano Zampini }; 37707e8381f9SStefano Zampini 37717e8381f9SStefano Zampini struct IJSum 37727e8381f9SStefano Zampini { 37737e8381f9SStefano Zampini __host__ __device__ 37747e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37757e8381f9SStefano Zampini { 37767e8381f9SStefano Zampini return t1||t2; 37777e8381f9SStefano Zampini } 37787e8381f9SStefano Zampini }; 37797e8381f9SStefano Zampini 37807e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3781219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3782219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 37837e8381f9SStefano Zampini { 37847e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3785fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3786bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 378708391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 37887e8381f9SStefano Zampini CsrMatrix *matrix; 37897e8381f9SStefano Zampini PetscInt n; 37907e8381f9SStefano Zampini 37917e8381f9SStefano Zampini PetscFunctionBegin; 379228b400f6SJacob Faibussowitsch PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 379328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 37947e8381f9SStefano Zampini if (!cusp->cooPerm) { 
37959566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 37969566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 37977e8381f9SStefano Zampini PetscFunctionReturn(0); 37987e8381f9SStefano Zampini } 37997e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 380028b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3801e61fc153SStefano Zampini if (!v) { 3802e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3803e61fc153SStefano Zampini goto finalize; 38047e8381f9SStefano Zampini } 3805e61fc153SStefano Zampini n = cusp->cooPerm->size(); 380608391a17SStefano Zampini if (isCudaMem(v)) { 380708391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 380808391a17SStefano Zampini } else { 3809e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3810e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 381108391a17SStefano Zampini d_v = cooPerm_v->data(); 38129566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 381308391a17SStefano Zampini } 38149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3815e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3816ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3817bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 381808391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3819ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3820ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3821ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
3822ddea5d60SJunchao Zhang */ 3823e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3824e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3825e61fc153SStefano Zampini delete cooPerm_w; 38267e8381f9SStefano Zampini } else { 3827ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 382808391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38297e8381f9SStefano Zampini matrix->values->begin())); 383008391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 38317e8381f9SStefano Zampini matrix->values->end())); 3832ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 38337e8381f9SStefano Zampini } 38347e8381f9SStefano Zampini } else { 3835e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 383608391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3837e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 38387e8381f9SStefano Zampini } else { 383908391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38407e8381f9SStefano Zampini matrix->values->begin())); 384108391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 
38427e8381f9SStefano Zampini matrix->values->end())); 38437e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 38447e8381f9SStefano Zampini } 38457e8381f9SStefano Zampini } 38469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3847e61fc153SStefano Zampini finalize: 3848e61fc153SStefano Zampini delete cooPerm_v; 38497e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 38509566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 3851fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 38529566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 38539566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 38549566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 3855fcdce8c4SStefano Zampini a->reallocs = 0; 3856fcdce8c4SStefano Zampini A->info.mallocs += 0; 3857fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3858fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3859fcdce8c4SStefano Zampini A->num_ass++; 38607e8381f9SStefano Zampini PetscFunctionReturn(0); 38617e8381f9SStefano Zampini } 38627e8381f9SStefano Zampini 3863a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3864a49f1ed0SStefano Zampini { 3865a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3866a49f1ed0SStefano Zampini 3867a49f1ed0SStefano Zampini PetscFunctionBegin; 3868a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3869a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3870a49f1ed0SStefano Zampini if (destroy) { 38719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 3872a49f1ed0SStefano 
Zampini delete cusp->csr2csc_i; 3873a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3874a49f1ed0SStefano Zampini } 38751a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3876a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3877a49f1ed0SStefano Zampini } 3878a49f1ed0SStefano Zampini 38797e8381f9SStefano Zampini #include <thrust/binary_search.h> 3880219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 3881219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 38827e8381f9SStefano Zampini { 38837e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 38847e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 38857e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 38867e8381f9SStefano Zampini 38877e8381f9SStefano Zampini PetscFunctionBegin; 38889566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 38899566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 38907e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 38917e8381f9SStefano Zampini if (n != cooPerm_n) { 38927e8381f9SStefano Zampini delete cusp->cooPerm; 38937e8381f9SStefano Zampini delete cusp->cooPerm_a; 38947e8381f9SStefano Zampini cusp->cooPerm = NULL; 38957e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 38967e8381f9SStefano Zampini } 38977e8381f9SStefano Zampini if (n) { 38987e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 38997e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 39007e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 39017e8381f9SStefano Zampini 39027e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 39037e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 39047e8381f9SStefano Zampini 39059566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 39067e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 39077e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 3908ddea5d60SJunchao Zhang 3909ddea5d60SJunchao Zhang /* Ex. 
3910ddea5d60SJunchao Zhang n = 6 3911ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 3912ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 3913ddea5d60SJunchao Zhang */ 39147e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 39157e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 39167e8381f9SStefano Zampini 39179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39187e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3919ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3920ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 39217e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 39227e8381f9SStefano Zampini 3923ddea5d60SJunchao Zhang /* 3924ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 3925ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 3926ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 3927ddea5d60SJunchao Zhang */ 3928ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3929ddea5d60SJunchao Zhang 3930ddea5d60SJunchao Zhang /* 3931ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 3932ddea5d60SJunchao Zhang ^ekey 3933ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 3934ddea5d60SJunchao Zhang ^nekye 3935ddea5d60SJunchao Zhang */ 39367e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 39377e8381f9SStefano Zampini delete cusp->cooPerm_a; 39387e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 3939ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3940ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3941ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 3942ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3943ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 39447e8381f9SStefano Zampini w[0] = 0; 3945ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3946ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 39477e8381f9SStefano Zampini } 39487e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 3949ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3950ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3951ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 39529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39537e8381f9SStefano Zampini 39549566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 39557e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 39567e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 39577e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 39589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 3959ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 39609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39617e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3962fcdce8c4SStefano Zampini a->rmax = 0; 39639566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->a)); 39649566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->j)); 39659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39669566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 39679566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 39687e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 39697e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 39707e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 39717e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3972fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 39737e8381f9SStefano Zampini } 3974fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 39757e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 39769566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 39779566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 39787e8381f9SStefano Zampini } else { 39799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 
39807e8381f9SStefano Zampini } 39819566063dSJacob Faibussowitsch PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 39827e8381f9SStefano Zampini 39837e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3984e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 39859566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->nz)); 39869566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 39877e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 39889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 39899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 39907e8381f9SStefano Zampini PetscFunctionReturn(0); 39917e8381f9SStefano Zampini } 3992ed502f03SStefano Zampini 3993219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 3994219fbbafSJunchao Zhang { 3995219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 3996219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 3997cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 3998219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 3999219fbbafSJunchao Zhang 4000219fbbafSJunchao Zhang PetscFunctionBegin; 40019566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 40029566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4003219fbbafSJunchao Zhang if (coo_i) { 40049566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i,&mtype)); 4005219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4006219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { 4007cbc6b225SStefano Zampini if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4008219fbbafSJunchao Zhang } 4009219fbbafSJunchao Zhang } 4010219fbbafSJunchao Zhang } 4011219fbbafSJunchao Zhang 
4012219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 40139566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4014219fbbafSJunchao Zhang } else { 40159566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4016cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 40179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4018219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ*>(mat->data); 4019219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 40209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 40219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 40229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 40239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4024219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4025219fbbafSJunchao Zhang } 4026219fbbafSJunchao Zhang PetscFunctionReturn(0); 4027219fbbafSJunchao Zhang } 4028219fbbafSJunchao Zhang 4029b6c38306SJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4030219fbbafSJunchao Zhang { 4031219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4032219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4033b6c38306SJunchao Zhang for (; i<nnz; i+= grid_size) { 4034b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4035b6c38306SJunchao Zhang for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4036b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES? 
0.0 : a[i]) + sum; 4037b6c38306SJunchao Zhang } 4038219fbbafSJunchao Zhang } 4039219fbbafSJunchao Zhang 4040219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4041219fbbafSJunchao Zhang { 4042219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4043219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4044219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4045219fbbafSJunchao Zhang PetscMemType memtype; 4046219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4047219fbbafSJunchao Zhang PetscScalar *Aa; 4048219fbbafSJunchao Zhang 4049219fbbafSJunchao Zhang PetscFunctionBegin; 4050219fbbafSJunchao Zhang if (dev->use_extended_coo) { 40519566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 4052219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 40539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 40549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4055219fbbafSJunchao Zhang } 4056219fbbafSJunchao Zhang 40579566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 40589566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4059219fbbafSJunchao Zhang 4060cbc6b225SStefano Zampini if (Annz) { 4061b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 40629566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4063cbc6b225SStefano Zampini } 4064219fbbafSJunchao Zhang 40659566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 40669566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4067219fbbafSJunchao Zhang 40689566063dSJacob 
Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4069219fbbafSJunchao Zhang } else { 40709566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4071219fbbafSJunchao Zhang } 4072219fbbafSJunchao Zhang PetscFunctionReturn(0); 4073219fbbafSJunchao Zhang } 4074219fbbafSJunchao Zhang 40755b7e41feSStefano Zampini /*@C 40765b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 40775b7e41feSStefano Zampini 40785b7e41feSStefano Zampini Not collective 40795b7e41feSStefano Zampini 40805b7e41feSStefano Zampini Input Parameters: 40815b7e41feSStefano Zampini + A - the matrix 40825b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40835b7e41feSStefano Zampini 40845b7e41feSStefano Zampini Output Parameters: 40855b7e41feSStefano Zampini + ia - the CSR row pointers 40865b7e41feSStefano Zampini - ja - the CSR column indices 40875b7e41feSStefano Zampini 40885b7e41feSStefano Zampini Level: developer 40895b7e41feSStefano Zampini 40905b7e41feSStefano Zampini Notes: 40915b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 40925b7e41feSStefano Zampini 40935b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 40945b7e41feSStefano Zampini @*/ 40955f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 40965f101d05SStefano Zampini { 40975f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 40985f101d05SStefano Zampini CsrMatrix *csr; 40995f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41005f101d05SStefano Zampini 41015f101d05SStefano Zampini PetscFunctionBegin; 41025f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41035f101d05SStefano Zampini if (!i 
|| !j) PetscFunctionReturn(0); 41045f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41052c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 410728b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 41085f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 41095f101d05SStefano Zampini if (i) { 41105f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 41115f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 41125f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 41135f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 41149566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 41155f101d05SStefano Zampini } 41165f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 41175f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 41185f101d05SStefano Zampini } 41195f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 41205f101d05SStefano Zampini PetscFunctionReturn(0); 41215f101d05SStefano Zampini } 41225f101d05SStefano Zampini 41235b7e41feSStefano Zampini /*@C 41245b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 41255b7e41feSStefano Zampini 41265b7e41feSStefano Zampini Not collective 41275b7e41feSStefano Zampini 41285b7e41feSStefano Zampini Input Parameters: 41295b7e41feSStefano Zampini + A - the matrix 41305b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 41315b7e41feSStefano Zampini 41325b7e41feSStefano Zampini Output 
Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* Nothing is released here: only the caller's copies of the pointers are cleared.
     Either argument may be NULL; "compressed" is not used. */
  if (i) {
    *i = NULL;
  }
  if (j) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data 41605b7e41feSStefano Zampini 41615b7e41feSStefano Zampini Level: developer 41625b7e41feSStefano Zampini 41635b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 41645b7e41feSStefano Zampini 41655b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 41665b7e41feSStefano Zampini @*/ 4167ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4168ed502f03SStefano Zampini { 4169ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4170ed502f03SStefano Zampini CsrMatrix *csr; 4171ed502f03SStefano Zampini 4172ed502f03SStefano Zampini PetscFunctionBegin; 4173ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4174ed502f03SStefano Zampini PetscValidPointer(a,2); 4175ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41762c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 417828b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4179ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 418028b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4181ed502f03SStefano Zampini *a = csr->values->data().get(); 4182ed502f03SStefano Zampini PetscFunctionReturn(0); 4183ed502f03SStefano Zampini } 4184ed502f03SStefano Zampini 41855b7e41feSStefano Zampini /*@C 41865b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 41875b7e41feSStefano Zampini 41885b7e41feSStefano Zampini Not Collective 41895b7e41feSStefano Zampini 
41905b7e41feSStefano Zampini Input Parameter: 41915b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41925b7e41feSStefano Zampini 41935b7e41feSStefano Zampini Output Parameter: 41945b7e41feSStefano Zampini . a - pointer to the device data 41955b7e41feSStefano Zampini 41965b7e41feSStefano Zampini Level: developer 41975b7e41feSStefano Zampini 41985b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead() 41995b7e41feSStefano Zampini @*/ 4200ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4201ed502f03SStefano Zampini { 4202ed502f03SStefano Zampini PetscFunctionBegin; 4203ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4204ed502f03SStefano Zampini PetscValidPointer(a,2); 4205ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4206ed502f03SStefano Zampini *a = NULL; 4207ed502f03SStefano Zampini PetscFunctionReturn(0); 4208ed502f03SStefano Zampini } 4209ed502f03SStefano Zampini 42105b7e41feSStefano Zampini /*@C 42115b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42125b7e41feSStefano Zampini 42135b7e41feSStefano Zampini Not Collective 42145b7e41feSStefano Zampini 42155b7e41feSStefano Zampini Input Parameter: 42165b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42175b7e41feSStefano Zampini 42185b7e41feSStefano Zampini Output Parameter: 42195b7e41feSStefano Zampini . 
a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so an invalid A is reported by the checks above */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* only the CSR storage format is handled here */
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller gets write access: flag the GPU copy as the valid one and mark the cached transpose as out of date */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4248039c6fbaSStefano Zampini 42495b7e41feSStefano Zampini Not Collective 42505b7e41feSStefano Zampini 42515b7e41feSStefano Zampini Input Parameter: 42525b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42535b7e41feSStefano Zampini 42545b7e41feSStefano Zampini Output Parameter: 42555b7e41feSStefano Zampini . a - pointer to the device data 42565b7e41feSStefano Zampini 42575b7e41feSStefano Zampini Level: developer 42585b7e41feSStefano Zampini 42595b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray() 42605b7e41feSStefano Zampini @*/ 4261039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4262039c6fbaSStefano Zampini { 4263039c6fbaSStefano Zampini PetscFunctionBegin; 4264039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4265039c6fbaSStefano Zampini PetscValidPointer(a,2); 4266039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 42679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 42689566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4269039c6fbaSStefano Zampini *a = NULL; 4270039c6fbaSStefano Zampini PetscFunctionReturn(0); 4271039c6fbaSStefano Zampini } 4272039c6fbaSStefano Zampini 42735b7e41feSStefano Zampini /*@C 42745b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42755b7e41feSStefano Zampini 42765b7e41feSStefano Zampini Not Collective 42775b7e41feSStefano Zampini 42785b7e41feSStefano Zampini Input Parameter: 42795b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42805b7e41feSStefano Zampini 42815b7e41feSStefano Zampini Output Parameter: 42825b7e41feSStefano Zampini . 
a - pointer to the device data 42835b7e41feSStefano Zampini 42845b7e41feSStefano Zampini Level: developer 42855b7e41feSStefano Zampini 42865b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 42875b7e41feSStefano Zampini 42885b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 42895b7e41feSStefano Zampini @*/ 4290ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4291ed502f03SStefano Zampini { 4292ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4293ed502f03SStefano Zampini CsrMatrix *csr; 4294ed502f03SStefano Zampini 4295ed502f03SStefano Zampini PetscFunctionBegin; 4296ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4297ed502f03SStefano Zampini PetscValidPointer(a,2); 4298ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 42992c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 430028b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4301ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 430228b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4303ed502f03SStefano Zampini *a = csr->values->data().get(); 4304039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 43059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4306ed502f03SStefano Zampini PetscFunctionReturn(0); 4307ed502f03SStefano Zampini } 4308ed502f03SStefano Zampini 43095b7e41feSStefano Zampini /*@C 43105b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 43115b7e41feSStefano Zampini 
43125b7e41feSStefano Zampini Not Collective 43135b7e41feSStefano Zampini 43145b7e41feSStefano Zampini Input Parameter: 43155b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43165b7e41feSStefano Zampini 43175b7e41feSStefano Zampini Output Parameter: 43185b7e41feSStefano Zampini . a - pointer to the device data 43195b7e41feSStefano Zampini 43205b7e41feSStefano Zampini Level: developer 43215b7e41feSStefano Zampini 43225b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 43235b7e41feSStefano Zampini @*/ 4324ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4325ed502f03SStefano Zampini { 4326ed502f03SStefano Zampini PetscFunctionBegin; 4327ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4328ed502f03SStefano Zampini PetscValidPointer(a,2); 4329ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 43309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 43319566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4332ed502f03SStefano Zampini *a = NULL; 4333ed502f03SStefano Zampini PetscFunctionReturn(0); 4334ed502f03SStefano Zampini } 4335ed502f03SStefano Zampini 4336ed502f03SStefano Zampini struct IJCompare4 4337ed502f03SStefano Zampini { 4338ed502f03SStefano Zampini __host__ __device__ 43392ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4340ed502f03SStefano Zampini { 4341ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4342ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4343ed502f03SStefano Zampini return false; 4344ed502f03SStefano Zampini } 4345ed502f03SStefano Zampini }; 4346ed502f03SStefano Zampini 43478909a122SStefano Zampini struct Shift 43488909a122SStefano Zampini { 4349ed502f03SStefano Zampini int _shift; 4350ed502f03SStefano Zampini 
4351ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4352ed502f03SStefano Zampini __host__ __device__ 4353ed502f03SStefano Zampini inline int operator() (const int &c) 4354ed502f03SStefano Zampini { 4355ed502f03SStefano Zampini return c + _shift; 4356ed502f03SStefano Zampini } 4357ed502f03SStefano Zampini }; 4358ed502f03SStefano Zampini 4359ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4360ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4361ed502f03SStefano Zampini { 4362ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4363ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4364ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4365ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4366ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4367ed502f03SStefano Zampini cusparseStatus_t stat; 4368ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4369ed502f03SStefano Zampini 4370ed502f03SStefano Zampini PetscFunctionBegin; 4371ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4372ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4373ed502f03SStefano Zampini PetscValidPointer(C,4); 4374ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4375ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 43765f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 437708401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 43782c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == 
MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 43792c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4380ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4381ed502f03SStefano Zampini m = A->rmap->n; 4382ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 43839566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,C)); 43849566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C,m,n,m,n)); 43859566063dSJacob Faibussowitsch PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 4386ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4387ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4388ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4389ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4390ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4391ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4392ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4393ed502f03SStefano Zampini c->compressedrow.i = NULL; 4394ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4395ed502f03SStefano Zampini Ccusp->workVector = NULL; 4396ed502f03SStefano Zampini Ccusp->nrows = m; 4397ed502f03SStefano Zampini Ccusp->mat = Cmat; 4398ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4399ed502f03SStefano Zampini Ccsr->num_rows = m; 4400ed502f03SStefano Zampini Ccsr->num_cols = n; 44019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 44029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 44039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 44049566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 44059566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&(Cmat->beta_zero),sizeof(PetscScalar))); 44069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 44079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44089566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44099566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 44119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 441228b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 441328b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4414ed502f03SStefano Zampini 4415ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4416ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4417ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4418ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4419ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4420ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4421ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4422ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4423ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4424ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4425ed502f03SStefano Zampini if (c->nz) { 44262ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 44272ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 44282ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 44292ed87e7eSStefano Zampini THRUSTINTARRAY32 
*Aroff,*Broff; 44302ed87e7eSStefano Zampini 4431ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4432ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4433ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4434ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 44359566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4436ed502f03SStefano Zampini } 44372ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 44382ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4439ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4440ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4441ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4442ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 44439566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4444ed502f03SStefano Zampini } 44452ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 44462ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 44479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 44482ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 44492ed87e7eSStefano Zampini Aroff->data().get(), 44502ed87e7eSStefano Zampini Annz, 44512ed87e7eSStefano Zampini m, 44522ed87e7eSStefano Zampini Acoo->data().get(), 44539566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4454ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 44552ed87e7eSStefano Zampini Broff->data().get(), 4456ed502f03SStefano Zampini Bnnz, 4457ed502f03SStefano Zampini m, 44582ed87e7eSStefano Zampini Bcoo->data().get(), 44599566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 44602ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 
44612ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 44622ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 44638909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4464ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4465ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 44668909a122SStefano Zampini #else 44678909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 44688909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 44698909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 44708909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 44718909a122SStefano Zampini #endif 44722ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 44732ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 44742ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 44752ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 44762ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 44772ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4478ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4479ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4480ed502f03SStefano Zampini thrust::advance(p2,Annz); 44812ed87e7eSStefano Zampini 
PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 44828909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 44838909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 44848909a122SStefano Zampini #endif 44852ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 44862ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 44872ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 44882ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 44892ed87e7eSStefano Zampini #else 44902ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 44912ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 44922ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 44932ed87e7eSStefano Zampini #endif 4494ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 44952ed87e7eSStefano Zampini Ccoo->data().get(), 4496ed502f03SStefano Zampini c->nz, 4497ed502f03SStefano Zampini m, 4498ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 44999566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 45009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 45012ed87e7eSStefano Zampini delete wPerm; 45022ed87e7eSStefano Zampini delete Acoo; 45032ed87e7eSStefano Zampini delete Bcoo; 45042ed87e7eSStefano Zampini delete Ccoo; 4505ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4506ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4507ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4508ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 45099566063dSJacob Faibussowitsch 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4510ed502f03SStefano Zampini #endif 45111a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 45129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 45139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4514ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4515ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4516ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4517ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4518ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4519ed502f03SStefano Zampini 45201a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 45211a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4522a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4523ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4524ed502f03SStefano Zampini CmatT->mat = CcsrT; 4525ed502f03SStefano Zampini CcsrT->num_rows = n; 4526ed502f03SStefano Zampini CcsrT->num_cols = m; 4527ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4528ed502f03SStefano Zampini 4529ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4530ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4531ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4532ed502f03SStefano Zampini 45339566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4534ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4535ed502f03SStefano Zampini if (AT) { 4536ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 
4537ed502f03SStefano Zampini thrust::advance(rT,-1); 4538ed502f03SStefano Zampini } 4539ed502f03SStefano Zampini if (BT) { 4540ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4541ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4542ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4543ed502f03SStefano Zampini } 4544ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4545ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4546ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4547ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4548ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4549ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 45509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4551ed502f03SStefano Zampini 45529566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 45539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 45549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 45559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 45569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 45579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 45589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45599566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4561ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4562ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4563ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4564ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 45659566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4566ed502f03SStefano Zampini #endif 4567ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4568ed502f03SStefano Zampini } 4569ed502f03SStefano Zampini } 4570ed502f03SStefano Zampini 4571ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4572ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4573ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 45749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 45759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 4576ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4577ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4578ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4579ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4580ed502f03SStefano Zampini jj = *Ccsr->column_indices; 45819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45829566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4583ed502f03SStefano Zampini } else { 
45849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4586ed502f03SStefano Zampini } 45879566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 45889566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 45899566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 4590ed502f03SStefano Zampini c->maxnz = c->nz; 4591ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4592ed502f03SStefano Zampini c->rmax = 0; 4593ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4594ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4595ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4596ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4597ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4598ed502f03SStefano Zampini } 45999566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 46009566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 4601ed502f03SStefano Zampini (*C)->nonzerostate++; 46029566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 46039566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4604ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4605ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4606ed502f03SStefano Zampini } else { 460708401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4608ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4609ed502f03SStefano Zampini if (c->nz) { 4610ed502f03SStefano Zampini Ccusp = 
(Mat_SeqAIJCUSPARSE*)(*C)->spptr; 46115f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 46122c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 461308401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 46149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 46159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 46165f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 46175f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4618ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4619ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4620ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 46212c71b3e2SJacob Faibussowitsch PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 46222c71b3e2SJacob Faibussowitsch PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 46232c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 46242c71b3e2SJacob Faibussowitsch PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 
46255f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4626ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4627ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 46289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4629ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4630ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4631ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4632ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4633ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4634ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4635ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4636ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4637ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4638ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 46399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 46401a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 46415f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4642ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE; 4643ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4644ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4645ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4646ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4647ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4648ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 46491a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4650ed502f03SStefano Zampini } 46519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4652ed502f03SStefano Zampini } 4653ed502f03SStefano Zampini } 46549566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4655ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4656ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4657ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4658ed502f03SStefano Zampini PetscFunctionReturn(0); 4659ed502f03SStefano Zampini } 4660c215019aSStefano Zampini 4661c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4662c215019aSStefano Zampini { 4663c215019aSStefano Zampini bool dmem; 4664c215019aSStefano Zampini const PetscScalar *av; 4665c215019aSStefano Zampini 4666c215019aSStefano Zampini PetscFunctionBegin; 4667c215019aSStefano Zampini dmem = isCudaMem(v); 46689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4669c215019aSStefano Zampini if (n && idx) { 4670c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4671c215019aSStefano Zampini widx.assign(idx,idx+n); 46729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4673c215019aSStefano Zampini 4674c215019aSStefano Zampini THRUSTARRAY *w = NULL; 
4675c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4676c215019aSStefano Zampini if (dmem) { 4677c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4678c215019aSStefano Zampini } else { 4679c215019aSStefano Zampini w = new THRUSTARRAY(n); 4680c215019aSStefano Zampini dv = w->data(); 4681c215019aSStefano Zampini } 4682c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4683c215019aSStefano Zampini 4684c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4685c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4686c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4687c215019aSStefano Zampini if (w) { 46889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4689c215019aSStefano Zampini } 4690c215019aSStefano Zampini delete w; 4691c215019aSStefano Zampini } else { 46929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4693c215019aSStefano Zampini } 46949566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 46959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4696c215019aSStefano Zampini PetscFunctionReturn(0); 4697c215019aSStefano Zampini } 4698