19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3bc3f50f2SPaul Mullowney matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16bc3f50f2SPaul Mullowney 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 189ae82921SPaul Mullowney 19087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 20087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 21087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 22087f3262SPaul Mullowney 236fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 246fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 256fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 26087f3262SPaul Mullowney 276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 286fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 306fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 314416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 326fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 336fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 346fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 356fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 36*e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 37*e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 38*e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 399ae82921SPaul Mullowney 407f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 41470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 42470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 43ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 44470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 45470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 467f756511SDominic Meiser 47b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 48b06137fdSPaul Mullowney { 49b06137fdSPaul Mullowney cusparseStatus_t stat; 50b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 51b06137fdSPaul Mullowney 52b06137fdSPaul Mullowney PetscFunctionBegin; 53b06137fdSPaul Mullowney cusparsestruct->stream = stream; 5457d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 55b06137fdSPaul Mullowney PetscFunctionReturn(0); 56b06137fdSPaul Mullowney } 57b06137fdSPaul Mullowney 58b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 59b06137fdSPaul Mullowney { 60b06137fdSPaul Mullowney cusparseStatus_t stat; 61b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 62b06137fdSPaul Mullowney 63b06137fdSPaul Mullowney PetscFunctionBegin; 646b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 6516a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 6657d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 6716a2e217SAlejandro Lamas Daviña } 68b06137fdSPaul Mullowney cusparsestruct->handle = handle; 696b1cf21dSAlejandro Lamas Daviña } 7057d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 71b06137fdSPaul Mullowney PetscFunctionReturn(0); 72b06137fdSPaul Mullowney } 73b06137fdSPaul Mullowney 74b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 75b06137fdSPaul Mullowney { 76b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 77ccdfe979SStefano Zampini 78b06137fdSPaul Mullowney PetscFunctionBegin; 79ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 80b06137fdSPaul Mullowney PetscFunctionReturn(0); 81b06137fdSPaul Mullowney } 82b06137fdSPaul Mullowney 83ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 849ae82921SPaul Mullowney { 859ae82921SPaul Mullowney PetscFunctionBegin; 869ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 879ae82921SPaul Mullowney PetscFunctionReturn(0); 889ae82921SPaul Mullowney } 899ae82921SPaul Mullowney 90c708e6cdSJed Brown /*MC 91087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 92087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 93087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 94087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 95087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 96087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 97c708e6cdSJed Brown 989ae82921SPaul Mullowney Level: beginner 99c708e6cdSJed Brown 1003ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 101c708e6cdSJed Brown M*/ 1029ae82921SPaul Mullowney 10342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1049ae82921SPaul Mullowney { 1059ae82921SPaul Mullowney PetscErrorCode ierr; 106bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1079ae82921SPaul Mullowney 1089ae82921SPaul Mullowney PetscFunctionBegin; 109bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 110404133a2SPaul Mullowney (*B)->factortype = ftype; 111bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1129ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1132205254eSKarl Rupp 114087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 11533d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1169ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1179ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 118087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 119087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 120087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1219ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 122bc3f50f2SPaul Mullowney 123fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1243ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1259ae82921SPaul Mullowney PetscFunctionReturn(0); 1269ae82921SPaul Mullowney } 1279ae82921SPaul Mullowney 128bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 129ca45077fSPaul Mullowney { 130aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1316e111a19SKarl Rupp 132ca45077fSPaul Mullowney PetscFunctionBegin; 133ca45077fSPaul Mullowney switch (op) { 134e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 135aa372e3fSPaul Mullowney cusparsestruct->format = format; 136ca45077fSPaul Mullowney break; 137e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 138aa372e3fSPaul Mullowney cusparsestruct->format = format; 139ca45077fSPaul Mullowney break; 140ca45077fSPaul Mullowney default: 14136d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 142ca45077fSPaul Mullowney } 143ca45077fSPaul Mullowney PetscFunctionReturn(0); 144ca45077fSPaul Mullowney } 1459ae82921SPaul Mullowney 146e057df02SPaul Mullowney /*@ 147e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 148e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 149aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 150e057df02SPaul Mullowney Not Collective 151e057df02SPaul Mullowney 152e057df02SPaul Mullowney Input Parameters: 1538468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 15436d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1552692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 156e057df02SPaul Mullowney 157e057df02SPaul Mullowney Output Parameter: 158e057df02SPaul Mullowney 159e057df02SPaul Mullowney Level: intermediate 160e057df02SPaul Mullowney 1618468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 162e057df02SPaul Mullowney @*/ 163e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 164e057df02SPaul Mullowney { 165e057df02SPaul Mullowney PetscErrorCode ierr; 1666e111a19SKarl Rupp 167e057df02SPaul Mullowney PetscFunctionBegin; 168e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 170e057df02SPaul Mullowney PetscFunctionReturn(0); 171e057df02SPaul Mullowney } 172e057df02SPaul Mullowney 173*e6e9a74fSStefano Zampini /*@ 174*e6e9a74fSStefano Zampini MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 175*e6e9a74fSStefano Zampini 176*e6e9a74fSStefano Zampini Collective on mat 177*e6e9a74fSStefano Zampini 178*e6e9a74fSStefano Zampini Input Parameters: 179*e6e9a74fSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE 180*e6e9a74fSStefano Zampini - transgen - the boolean flag 181*e6e9a74fSStefano Zampini 182*e6e9a74fSStefano Zampini Level: intermediate 183*e6e9a74fSStefano Zampini 184*e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE 185*e6e9a74fSStefano Zampini @*/ 186*e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 187*e6e9a74fSStefano Zampini { 188*e6e9a74fSStefano Zampini PetscErrorCode ierr; 189*e6e9a74fSStefano Zampini PetscBool flg; 190*e6e9a74fSStefano Zampini 191*e6e9a74fSStefano Zampini PetscFunctionBegin; 192*e6e9a74fSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 193*e6e9a74fSStefano Zampini ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 194*e6e9a74fSStefano Zampini if (flg) { 195*e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 196*e6e9a74fSStefano Zampini if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 197*e6e9a74fSStefano Zampini cusp->transgen = transgen; 198*e6e9a74fSStefano Zampini } 199*e6e9a74fSStefano Zampini PetscFunctionReturn(0); 200*e6e9a74fSStefano Zampini } 201*e6e9a74fSStefano Zampini 2024416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2039ae82921SPaul Mullowney { 2049ae82921SPaul Mullowney PetscErrorCode ierr; 205e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2069ae82921SPaul Mullowney PetscBool flg; 207a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2086e111a19SKarl Rupp 2099ae82921SPaul Mullowney PetscFunctionBegin; 210e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2119ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 212*e6e9a74fSStefano Zampini ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusparsestruct->transgen,&cusparsestruct->transgen,NULL);CHKERRQ(ierr); 213e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 214a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 215e057df02SPaul Mullowney if (flg) { 216e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 217045c96e1SPaul Mullowney } 2189ae82921SPaul Mullowney } 2194c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 220a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 2214c87dfd4SPaul Mullowney if (flg) { 2224c87dfd4SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 2234c87dfd4SPaul Mullowney } 2240af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 2259ae82921SPaul Mullowney PetscFunctionReturn(0); 2269ae82921SPaul Mullowney } 2279ae82921SPaul Mullowney 2286fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2299ae82921SPaul Mullowney { 2309ae82921SPaul Mullowney PetscErrorCode ierr; 2319ae82921SPaul Mullowney 2329ae82921SPaul Mullowney PetscFunctionBegin; 2339ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2349ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2359ae82921SPaul Mullowney PetscFunctionReturn(0); 2369ae82921SPaul Mullowney } 2379ae82921SPaul Mullowney 2386fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2399ae82921SPaul Mullowney { 2409ae82921SPaul Mullowney PetscErrorCode ierr; 2419ae82921SPaul Mullowney 2429ae82921SPaul Mullowney PetscFunctionBegin; 2439ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2449ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2459ae82921SPaul Mullowney PetscFunctionReturn(0); 2469ae82921SPaul Mullowney } 2479ae82921SPaul Mullowney 248087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 249087f3262SPaul Mullowney { 250087f3262SPaul Mullowney PetscErrorCode ierr; 251087f3262SPaul Mullowney 252087f3262SPaul Mullowney PetscFunctionBegin; 253087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 254087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 255087f3262SPaul Mullowney PetscFunctionReturn(0); 256087f3262SPaul Mullowney } 257087f3262SPaul Mullowney 258087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 259087f3262SPaul Mullowney { 260087f3262SPaul Mullowney PetscErrorCode ierr; 261087f3262SPaul Mullowney 262087f3262SPaul Mullowney PetscFunctionBegin; 263087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 264087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 265087f3262SPaul Mullowney PetscFunctionReturn(0); 266087f3262SPaul Mullowney } 267087f3262SPaul Mullowney 268087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 2699ae82921SPaul Mullowney { 2709ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2719ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2729ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 273aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 2749ae82921SPaul Mullowney cusparseStatus_t stat; 2759ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2769ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2779ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2789ae82921SPaul Mullowney PetscScalar *AALo; 2799ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 280b175d8bbSPaul Mullowney PetscErrorCode ierr; 28157d48284SJunchao Zhang cudaError_t cerr; 2829ae82921SPaul Mullowney 2839ae82921SPaul Mullowney PetscFunctionBegin; 284cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 285c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2869ae82921SPaul Mullowney try { 2879ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2889ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 2899ae82921SPaul Mullowney 2909ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 29157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 29257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 29357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 2949ae82921SPaul Mullowney 2959ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2969ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2979ae82921SPaul Mullowney AiLo[n] = nzLower; 2989ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2999ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3009ae82921SPaul Mullowney v = aa; 3019ae82921SPaul Mullowney vi = aj; 3029ae82921SPaul Mullowney offset = 1; 3039ae82921SPaul Mullowney rowOffset= 1; 3049ae82921SPaul Mullowney for (i=1; i<n; i++) { 3059ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 306e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3079ae82921SPaul Mullowney AiLo[i] = rowOffset; 3089ae82921SPaul Mullowney rowOffset += nz+1; 3099ae82921SPaul Mullowney 310580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 311580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 3129ae82921SPaul Mullowney 3139ae82921SPaul Mullowney offset += nz; 3149ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3159ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3169ae82921SPaul Mullowney offset += 1; 3179ae82921SPaul Mullowney 3189ae82921SPaul Mullowney v += nz; 3199ae82921SPaul Mullowney vi += nz; 3209ae82921SPaul Mullowney } 3212205254eSKarl Rupp 322aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 323aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 3242205254eSKarl Rupp 325aa372e3fSPaul Mullowney /* Create the matrix description */ 32657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 32757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 32857d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 32957d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 33057d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 331aa372e3fSPaul Mullowney 332aa372e3fSPaul Mullowney /* Create the solve analysis information */ 33357d48284SJunchao Zhang stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 334aa372e3fSPaul Mullowney 335aa372e3fSPaul Mullowney /* set the operation */ 336aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 337aa372e3fSPaul Mullowney 338aa372e3fSPaul Mullowney /* set the matrix */ 339aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 340aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 341aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 342aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 343aa372e3fSPaul Mullowney 344aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 345aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 346aa372e3fSPaul Mullowney 347aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 348aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 349aa372e3fSPaul Mullowney 350aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 351aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 352aa372e3fSPaul Mullowney 353aa372e3fSPaul Mullowney /* perform the solve analysis */ 354aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 355aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 356aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 35757d48284SJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 358aa372e3fSPaul Mullowney 359aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 360aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 3612205254eSKarl Rupp 36257d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 36357d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 36457d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 3654863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 3669ae82921SPaul Mullowney } catch(char *ex) { 3679ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3689ae82921SPaul Mullowney } 3699ae82921SPaul Mullowney } 3709ae82921SPaul Mullowney PetscFunctionReturn(0); 3719ae82921SPaul Mullowney } 3729ae82921SPaul Mullowney 373087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 3749ae82921SPaul Mullowney { 3759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3769ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3779ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 378aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 3799ae82921SPaul Mullowney cusparseStatus_t stat; 3809ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 3819ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3829ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 3839ae82921SPaul Mullowney PetscScalar *AAUp; 3849ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 3859ae82921SPaul Mullowney PetscErrorCode ierr; 38657d48284SJunchao Zhang cudaError_t cerr; 3879ae82921SPaul Mullowney 3889ae82921SPaul Mullowney PetscFunctionBegin; 389cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 390c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3919ae82921SPaul Mullowney try { 3929ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 3939ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 3949ae82921SPaul Mullowney 3959ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 39657d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 39757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 39857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 3999ae82921SPaul Mullowney 4009ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4019ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 4029ae82921SPaul Mullowney AiUp[n]=nzUpper; 4039ae82921SPaul Mullowney offset = nzUpper; 4049ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 4059ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 4069ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 4079ae82921SPaul Mullowney 408e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4099ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 4109ae82921SPaul Mullowney 411e057df02SPaul Mullowney /* decrement the offset */ 4129ae82921SPaul Mullowney offset -= (nz+1); 4139ae82921SPaul Mullowney 414e057df02SPaul Mullowney /* first, set the diagonal elements */ 4159ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 41609f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 4179ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 4189ae82921SPaul Mullowney 419580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 420580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 4219ae82921SPaul Mullowney } 4222205254eSKarl Rupp 423aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 424aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 4252205254eSKarl Rupp 426aa372e3fSPaul Mullowney /* Create the matrix description */ 42757d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 42857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 42957d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 43057d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 43157d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 432aa372e3fSPaul Mullowney 433aa372e3fSPaul Mullowney /* Create the solve analysis information */ 43457d48284SJunchao Zhang stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 435aa372e3fSPaul Mullowney 436aa372e3fSPaul Mullowney /* set the operation */ 437aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 438aa372e3fSPaul Mullowney 439aa372e3fSPaul Mullowney /* set the matrix */ 440aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 441aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 442aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 443aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 444aa372e3fSPaul Mullowney 445aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 447aa372e3fSPaul Mullowney 448aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 449aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 450aa372e3fSPaul Mullowney 451aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 452aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 453aa372e3fSPaul Mullowney 454aa372e3fSPaul Mullowney /* perform the solve analysis */ 455aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 456aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 457aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 45857d48284SJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 459aa372e3fSPaul Mullowney 460aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 461aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 4622205254eSKarl Rupp 46357d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 46457d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 46557d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 4664863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 4679ae82921SPaul Mullowney } catch(char *ex) { 4689ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4699ae82921SPaul Mullowney } 4709ae82921SPaul Mullowney } 4719ae82921SPaul Mullowney PetscFunctionReturn(0); 4729ae82921SPaul Mullowney } 4739ae82921SPaul Mullowney 474087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 4759ae82921SPaul Mullowney { 4769ae82921SPaul Mullowney PetscErrorCode ierr; 4779ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4789ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4799ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 4809ae82921SPaul Mullowney PetscBool row_identity,col_identity; 4819ae82921SPaul Mullowney const PetscInt *r,*c; 4829ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4839ae82921SPaul Mullowney 4849ae82921SPaul Mullowney PetscFunctionBegin; 485ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 486087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 487087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 4882205254eSKarl Rupp 489e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 490aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 4919ae82921SPaul Mullowney 492c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 493e057df02SPaul Mullowney /* lower triangular indices */ 4949ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4959ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4962205254eSKarl Rupp if (!row_identity) { 497aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 498aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 4992205254eSKarl Rupp } 5009ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5019ae82921SPaul Mullowney 502e057df02SPaul Mullowney /* upper triangular indices */ 5039ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 5049ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5052205254eSKarl Rupp if (!col_identity) { 506aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 507aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 5082205254eSKarl Rupp } 5094863603aSSatish Balay 5104863603aSSatish Balay if (!row_identity && !col_identity) { 5114863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 5124863603aSSatish Balay } else if(!row_identity) { 5134863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 5144863603aSSatish Balay } else if(!col_identity) { 5154863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 5164863603aSSatish Balay } 5174863603aSSatish Balay 5189ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 5199ae82921SPaul Mullowney PetscFunctionReturn(0); 5209ae82921SPaul Mullowney } 5219ae82921SPaul Mullowney 522087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 523087f3262SPaul Mullowney { 524087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 525087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 526aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 527aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 528087f3262SPaul Mullowney cusparseStatus_t stat; 529087f3262SPaul Mullowney PetscErrorCode ierr; 53057d48284SJunchao Zhang cudaError_t cerr; 531087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 532087f3262SPaul Mullowney PetscScalar *AAUp; 533087f3262SPaul Mullowney PetscScalar *AALo; 534087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 535087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 536087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 537087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 538087f3262SPaul Mullowney 539087f3262SPaul Mullowney PetscFunctionBegin; 540cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 541c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 542087f3262SPaul Mullowney try { 543087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 54457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 54557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 54657d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 54757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 548087f3262SPaul Mullowney 549087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 550087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 551087f3262SPaul Mullowney AiUp[n]=nzUpper; 552087f3262SPaul Mullowney offset = 0; 553087f3262SPaul Mullowney for (i=0; i<n; i++) { 554087f3262SPaul Mullowney /* set the pointers */ 555087f3262SPaul Mullowney v = aa + ai[i]; 556087f3262SPaul Mullowney vj = aj + ai[i]; 557087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 558087f3262SPaul Mullowney 559087f3262SPaul Mullowney /* first, set the diagonal elements */ 560087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 56109f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 562087f3262SPaul Mullowney AiUp[i] = offset; 56309f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 564087f3262SPaul Mullowney 565087f3262SPaul Mullowney offset+=1; 566087f3262SPaul Mullowney if (nz>0) { 567f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 568580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 569087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 570087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 571087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 572087f3262SPaul Mullowney } 573087f3262SPaul Mullowney offset+=nz; 574087f3262SPaul Mullowney } 575087f3262SPaul Mullowney } 576087f3262SPaul Mullowney 577aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 578aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 579087f3262SPaul Mullowney 580aa372e3fSPaul Mullowney /* Create the matrix description */ 58157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 58257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 58357d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 58457d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 58557d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 586087f3262SPaul Mullowney 587aa372e3fSPaul Mullowney /* Create the solve analysis information */ 58857d48284SJunchao Zhang stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 589aa372e3fSPaul Mullowney 590aa372e3fSPaul Mullowney /* set the operation */ 591aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 592aa372e3fSPaul Mullowney 593aa372e3fSPaul Mullowney /* set the matrix */ 594aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 595aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 596aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 597aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 598aa372e3fSPaul Mullowney 599aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 600aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 601aa372e3fSPaul Mullowney 602aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 603aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 604aa372e3fSPaul Mullowney 605aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 606aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 607aa372e3fSPaul Mullowney 608aa372e3fSPaul Mullowney /* perform the solve analysis */ 609aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 61257d48284SJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 613aa372e3fSPaul Mullowney 614aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 615aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 616aa372e3fSPaul Mullowney 617aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 618aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 619aa372e3fSPaul Mullowney 620aa372e3fSPaul Mullowney /* Create the matrix description */ 62157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 62257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 62357d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 62457d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 62557d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 626aa372e3fSPaul Mullowney 627aa372e3fSPaul Mullowney /* Create the solve analysis information */ 62857d48284SJunchao Zhang stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 629aa372e3fSPaul Mullowney 630aa372e3fSPaul Mullowney /* set the operation */ 631aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 632aa372e3fSPaul Mullowney 633aa372e3fSPaul Mullowney /* set the matrix */ 634aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 635aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 636aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 637aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 638aa372e3fSPaul Mullowney 639aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 640aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 641aa372e3fSPaul Mullowney 642aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 643aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 644aa372e3fSPaul Mullowney 645aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 646aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 6474863603aSSatish Balay ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 648aa372e3fSPaul Mullowney 649aa372e3fSPaul Mullowney /* perform the solve analysis */ 650aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 651aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 652aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 65357d48284SJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 654aa372e3fSPaul Mullowney 655aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 656aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 657087f3262SPaul Mullowney 658c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 65957d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 66057d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 66157d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 66257d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 663087f3262SPaul Mullowney } catch(char *ex) { 664087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 665087f3262SPaul Mullowney } 666087f3262SPaul Mullowney } 667087f3262SPaul Mullowney PetscFunctionReturn(0); 668087f3262SPaul Mullowney } 669087f3262SPaul Mullowney 670087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 6719ae82921SPaul Mullowney { 6729ae82921SPaul Mullowney PetscErrorCode ierr; 673087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 674087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 675087f3262SPaul Mullowney IS ip = a->row; 676087f3262SPaul Mullowney const PetscInt *rip; 677087f3262SPaul Mullowney PetscBool perm_identity; 678087f3262SPaul Mullowney PetscInt n = A->rmap->n; 679087f3262SPaul Mullowney 680087f3262SPaul Mullowney PetscFunctionBegin; 681ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 682087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 683e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 684aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 685aa372e3fSPaul Mullowney 686087f3262SPaul Mullowney /* lower triangular indices */ 687087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 688087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 689087f3262SPaul Mullowney if (!perm_identity) { 6904e4bbfaaSStefano Zampini IS iip; 6914e4bbfaaSStefano Zampini const PetscInt *irip; 6924e4bbfaaSStefano Zampini 6934e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 6944e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 695aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 696aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 697aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 6984e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 6994e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 7004e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 7014863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 702087f3262SPaul Mullowney } 703087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 704087f3262SPaul Mullowney PetscFunctionReturn(0); 705087f3262SPaul Mullowney } 706087f3262SPaul Mullowney 7076fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 7089ae82921SPaul Mullowney { 7099ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 7109ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 7119ae82921SPaul Mullowney PetscBool row_identity,col_identity; 712b175d8bbSPaul Mullowney PetscErrorCode ierr; 7139ae82921SPaul Mullowney 7149ae82921SPaul Mullowney PetscFunctionBegin; 7159ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 716ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 717e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 7189ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 7199ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 720bda325fcSPaul Mullowney if (row_identity && col_identity) { 721bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 722bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 7234e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 7244e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 725bda325fcSPaul Mullowney } else { 726bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 727bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 7284e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 7294e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 730bda325fcSPaul Mullowney } 7318dc1d2a3SPaul Mullowney 732e057df02SPaul Mullowney /* get the triangular factors */ 733087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 7349ae82921SPaul Mullowney PetscFunctionReturn(0); 7359ae82921SPaul Mullowney } 7369ae82921SPaul Mullowney 737087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 738087f3262SPaul Mullowney { 739087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 740087f3262SPaul Mullowney IS ip = b->row; 741087f3262SPaul Mullowney PetscBool perm_identity; 742b175d8bbSPaul Mullowney PetscErrorCode ierr; 743087f3262SPaul Mullowney 744087f3262SPaul Mullowney PetscFunctionBegin; 745087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 746ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 747087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 748087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 749087f3262SPaul Mullowney if (perm_identity) { 750087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 751087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 7524e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 7534e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 754087f3262SPaul Mullowney } else { 755087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 756087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 7574e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 7584e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 759087f3262SPaul Mullowney } 760087f3262SPaul Mullowney 761087f3262SPaul Mullowney /* get the triangular factors */ 762087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 763087f3262SPaul Mullowney PetscFunctionReturn(0); 764087f3262SPaul Mullowney } 7659ae82921SPaul Mullowney 766b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 767bda325fcSPaul Mullowney { 768bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 770aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 771aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 772aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 773bda325fcSPaul Mullowney cusparseStatus_t stat; 774aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 775aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 776aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 777aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 778b175d8bbSPaul Mullowney 779bda325fcSPaul Mullowney PetscFunctionBegin; 780bda325fcSPaul Mullowney 781aa372e3fSPaul Mullowney /*********************************************/ 782aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 783aa372e3fSPaul Mullowney /*********************************************/ 784aa372e3fSPaul Mullowney 785aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 786aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 787aa372e3fSPaul Mullowney 788aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 789aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 790aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 791aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 792aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 793aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 794aa372e3fSPaul Mullowney 795aa372e3fSPaul Mullowney /* Create the matrix description */ 79657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 79757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 79857d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 79957d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 80057d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 801aa372e3fSPaul Mullowney 802aa372e3fSPaul Mullowney /* Create the solve analysis information */ 80357d48284SJunchao Zhang stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 804aa372e3fSPaul Mullowney 805aa372e3fSPaul Mullowney /* set the operation */ 806aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 807aa372e3fSPaul Mullowney 808aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 809aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 810aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 811aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 812aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 813aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 814aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 815aa372e3fSPaul Mullowney loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 816aa372e3fSPaul Mullowney 817aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 818aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 819aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 820aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 821aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 822aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 823aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 824aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 825aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 82657d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 827aa372e3fSPaul Mullowney 828aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 829aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 830aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 831aa372e3fSPaul Mullowney loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 832aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 83357d48284SJunchao Zhang loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 834aa372e3fSPaul Mullowney 835aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 836aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 837aa372e3fSPaul Mullowney 838aa372e3fSPaul Mullowney /*********************************************/ 839aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 840aa372e3fSPaul Mullowney /*********************************************/ 841aa372e3fSPaul Mullowney 842aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 843aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 846aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 847aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 848aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 849aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 850aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 851aa372e3fSPaul Mullowney 852aa372e3fSPaul Mullowney /* Create the matrix description */ 85357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 85457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 85557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 85657d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 85757d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 858aa372e3fSPaul Mullowney 859aa372e3fSPaul Mullowney /* Create the solve analysis information */ 86057d48284SJunchao Zhang stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 861aa372e3fSPaul Mullowney 862aa372e3fSPaul Mullowney /* set the operation */ 863aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 864aa372e3fSPaul Mullowney 865aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 866aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 867aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 868aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 869aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 870aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 871aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 872aa372e3fSPaul Mullowney upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 873aa372e3fSPaul Mullowney 874aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 875aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 876aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 877aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 878aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 879aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 880aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 881aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 882aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 88357d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 884aa372e3fSPaul Mullowney 885aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 886aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 887aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 888aa372e3fSPaul Mullowney upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 889aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 89057d48284SJunchao Zhang upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 891aa372e3fSPaul Mullowney 892aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 893aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 894bda325fcSPaul Mullowney PetscFunctionReturn(0); 895bda325fcSPaul Mullowney } 896bda325fcSPaul Mullowney 897b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 898bda325fcSPaul Mullowney { 899aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 900aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 901aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 902bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 903bda325fcSPaul Mullowney cusparseStatus_t stat; 904aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 905b06137fdSPaul Mullowney cudaError_t err; 9064863603aSSatish Balay PetscErrorCode ierr; 907b175d8bbSPaul Mullowney 908bda325fcSPaul Mullowney PetscFunctionBegin; 909*e6e9a74fSStefano Zampini if (!cusparsestruct->transgen) PetscFunctionReturn(0); 910aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 911aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 91257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 913aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 91457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 91557d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 916aa372e3fSPaul Mullowney 917b06137fdSPaul Mullowney /* set alpha and beta */ 918c41cb2e2SAlejandro Lamas Daviña err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 9197656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 9207656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 9217656d835SStefano Zampini err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 9227656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 9237656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 92457d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 925b06137fdSPaul Mullowney 926aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 927aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 928aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 929554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 930554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 931aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 932a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 933aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 934aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 935a3fdcf43SKarl Rupp 93681902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 93781902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 93881902715SJunchao Zhang /* compute the transpose, i.e. the CSC */ 939aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 940a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 941a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 942aa372e3fSPaul Mullowney matrix->values->data().get(), 94381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 944aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 945aa372e3fSPaul Mullowney matrixT->values->data().get(), 946aa372e3fSPaul Mullowney matrixT->column_indices->data().get(), 947aa372e3fSPaul Mullowney matrixT->row_offsets->data().get(), 94857d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 949aa372e3fSPaul Mullowney /* assign the pointer */ 950aa372e3fSPaul Mullowney matstructT->mat = matrixT; 9514863603aSSatish Balay ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 952aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 953aa372e3fSPaul Mullowney /* First convert HYB to CSR */ 954aa372e3fSPaul Mullowney CsrMatrix *temp= new CsrMatrix; 955aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 956aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 957aa372e3fSPaul Mullowney temp->num_entries = a->nz; 958aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 959aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 960aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 961aa372e3fSPaul Mullowney 9622692e278SPaul Mullowney 963aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 964aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 965aa372e3fSPaul Mullowney temp->values->data().get(), 966aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 96757d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 968aa372e3fSPaul Mullowney 969aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 970aa372e3fSPaul Mullowney CsrMatrix *tempT= new CsrMatrix; 971aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 972aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 973aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 974aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 975aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 976aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 977aa372e3fSPaul Mullowney 978aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 979aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 980aa372e3fSPaul Mullowney temp->values->data().get(), 981aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 982aa372e3fSPaul Mullowney temp->column_indices->data().get(), 983aa372e3fSPaul Mullowney tempT->values->data().get(), 984aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 985aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 98657d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 987aa372e3fSPaul Mullowney 988aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 989aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 99057d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 991aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 992aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 993aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 994aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 995aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 996aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 99757d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 998aa372e3fSPaul Mullowney 999aa372e3fSPaul Mullowney /* assign the pointer */ 1000aa372e3fSPaul Mullowney matstructT->mat = hybMat; 10014863603aSSatish Balay ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 1002aa372e3fSPaul Mullowney 1003aa372e3fSPaul Mullowney /* delete temporaries */ 1004aa372e3fSPaul Mullowney if (tempT) { 1005aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1006aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1007aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1008aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1009087f3262SPaul Mullowney } 1010aa372e3fSPaul Mullowney if (temp) { 1011aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1012aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1013aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1014aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1015aa372e3fSPaul Mullowney } 1016aa372e3fSPaul Mullowney } 1017213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1018213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1019aa372e3fSPaul Mullowney /* assign the pointer */ 1020aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1021bda325fcSPaul Mullowney PetscFunctionReturn(0); 1022bda325fcSPaul Mullowney } 1023bda325fcSPaul Mullowney 10244e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 10256fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1026bda325fcSPaul Mullowney { 1027c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1028465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1029465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1030465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1031465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1032bda325fcSPaul Mullowney cusparseStatus_t stat; 1033bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1034aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1035aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1036aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1037b175d8bbSPaul Mullowney PetscErrorCode ierr; 103857d48284SJunchao Zhang cudaError_t cerr; 1039bda325fcSPaul Mullowney 1040bda325fcSPaul Mullowney PetscFunctionBegin; 1041aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1042aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1043bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1044aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1045aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1046bda325fcSPaul Mullowney } 1047bda325fcSPaul Mullowney 1048bda325fcSPaul Mullowney /* Get the GPU pointers */ 1049c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1050c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1051c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1052c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1053bda325fcSPaul Mullowney 10547a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1055aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1056c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1057c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1058c41cb2e2SAlejandro Lamas Daviña xGPU); 1059aa372e3fSPaul Mullowney 1060aa372e3fSPaul Mullowney /* First, solve U */ 1061aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 10627656d835SStefano Zampini upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1063aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1064aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1065aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1066aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 106757d48284SJunchao Zhang xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1068aa372e3fSPaul Mullowney 1069aa372e3fSPaul Mullowney /* Then, solve L */ 1070aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 10717656d835SStefano Zampini loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1072aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1073aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1074aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1075aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 107657d48284SJunchao Zhang tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1077aa372e3fSPaul Mullowney 1078aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1079c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1080c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1081aa372e3fSPaul Mullowney tempGPU->begin()); 1082aa372e3fSPaul Mullowney 1083aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1084c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1085bda325fcSPaul Mullowney 1086bda325fcSPaul Mullowney /* restore */ 1087c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1088c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 108957d48284SJunchao Zhang cerr = WaitForGPU();CHKERRCUDA(cerr); 1090661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1091958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1092bda325fcSPaul Mullowney PetscFunctionReturn(0); 1093bda325fcSPaul Mullowney } 1094bda325fcSPaul Mullowney 10956fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1096bda325fcSPaul Mullowney { 1097465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1098465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1099bda325fcSPaul Mullowney cusparseStatus_t stat; 1100bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1101aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1102aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1103aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1104b175d8bbSPaul Mullowney PetscErrorCode ierr; 110557d48284SJunchao Zhang cudaError_t cerr; 1106bda325fcSPaul Mullowney 1107bda325fcSPaul Mullowney PetscFunctionBegin; 1108aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1109aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1110bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1111aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1112aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1113bda325fcSPaul Mullowney } 1114bda325fcSPaul Mullowney 1115bda325fcSPaul Mullowney /* Get the GPU pointers */ 1116c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1117c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1118bda325fcSPaul Mullowney 11197a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1120aa372e3fSPaul Mullowney /* First, solve U */ 1121aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 11227656d835SStefano Zampini upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1123aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1124aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1125aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1126aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 112757d48284SJunchao Zhang barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1128aa372e3fSPaul Mullowney 1129aa372e3fSPaul Mullowney /* Then, solve L */ 1130aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 11317656d835SStefano Zampini loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1132aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1133aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1134aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1135aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 113657d48284SJunchao Zhang tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1137bda325fcSPaul Mullowney 1138bda325fcSPaul Mullowney /* restore */ 1139c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1140c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 114157d48284SJunchao Zhang cerr = WaitForGPU();CHKERRCUDA(cerr); 1142661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1143958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1144bda325fcSPaul Mullowney PetscFunctionReturn(0); 1145bda325fcSPaul Mullowney } 1146bda325fcSPaul Mullowney 11476fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 11489ae82921SPaul Mullowney { 1149465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1150465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1151465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1152465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 11539ae82921SPaul Mullowney cusparseStatus_t stat; 11549ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1155aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1156aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1157aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1158b175d8bbSPaul Mullowney PetscErrorCode ierr; 115957d48284SJunchao Zhang cudaError_t cerr; 11609ae82921SPaul Mullowney 11619ae82921SPaul Mullowney PetscFunctionBegin; 1162ebc8f436SDominic Meiser 1163e057df02SPaul Mullowney /* Get the GPU pointers */ 1164c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1165c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1166c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1167c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 11689ae82921SPaul Mullowney 11697a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1170aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1171c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1172c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 11734e4bbfaaSStefano Zampini tempGPU->begin()); 1174aa372e3fSPaul Mullowney 1175aa372e3fSPaul Mullowney /* Next, solve L */ 1176aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 11777656d835SStefano Zampini loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1178aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1179aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1180aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1181aa372e3fSPaul Mullowney loTriFactor->solveInfo, 118257d48284SJunchao Zhang tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1183aa372e3fSPaul Mullowney 1184aa372e3fSPaul Mullowney /* Then, solve U */ 1185aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 11867656d835SStefano Zampini upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1187aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1188aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1189aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1190aa372e3fSPaul Mullowney upTriFactor->solveInfo, 119157d48284SJunchao Zhang xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1192aa372e3fSPaul Mullowney 11934e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 11944e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 11954e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 11964e4bbfaaSStefano Zampini xGPU); 11979ae82921SPaul Mullowney 1198c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1199c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 120057d48284SJunchao Zhang cerr = WaitForGPU();CHKERRCUDA(cerr); 1201661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1202958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 12039ae82921SPaul Mullowney PetscFunctionReturn(0); 12049ae82921SPaul Mullowney } 12059ae82921SPaul Mullowney 12066fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 12079ae82921SPaul Mullowney { 1208465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1209465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 12109ae82921SPaul Mullowney cusparseStatus_t stat; 12119ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1212aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1213aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1214aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1215b175d8bbSPaul Mullowney PetscErrorCode ierr; 121657d48284SJunchao Zhang cudaError_t cerr; 12179ae82921SPaul Mullowney 12189ae82921SPaul Mullowney PetscFunctionBegin; 1219e057df02SPaul Mullowney /* Get the GPU pointers */ 1220c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1221c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 12229ae82921SPaul Mullowney 12237a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1224aa372e3fSPaul Mullowney /* First, solve L */ 1225aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 12267656d835SStefano Zampini loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1227aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1228aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1229aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1230aa372e3fSPaul Mullowney loTriFactor->solveInfo, 123157d48284SJunchao Zhang barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1232aa372e3fSPaul Mullowney 1233aa372e3fSPaul Mullowney /* Next, solve U */ 1234aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 12357656d835SStefano Zampini upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1236aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1237aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1238aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1239aa372e3fSPaul Mullowney upTriFactor->solveInfo, 124057d48284SJunchao Zhang tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 12419ae82921SPaul Mullowney 1242c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1243c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 124457d48284SJunchao Zhang cerr = WaitForGPU();CHKERRCUDA(cerr); 1245661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1246958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 12479ae82921SPaul Mullowney PetscFunctionReturn(0); 12489ae82921SPaul Mullowney } 12499ae82921SPaul Mullowney 12506fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 12519ae82921SPaul Mullowney { 1252aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 12537c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 12549ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1255213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 12569ae82921SPaul Mullowney PetscErrorCode ierr; 1257aa372e3fSPaul Mullowney cusparseStatus_t stat; 1258b06137fdSPaul Mullowney cudaError_t err; 12599ae82921SPaul Mullowney 12609ae82921SPaul Mullowney PetscFunctionBegin; 126195639643SRichard Tran Mills if (A->boundtocpu) PetscFunctionReturn(0); 1262c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 12639ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 126481902715SJunchao Zhang if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 126581902715SJunchao Zhang /* Copy values only */ 126681902715SJunchao Zhang CsrMatrix *mat,*matT; 126781902715SJunchao Zhang mat = (CsrMatrix*)cusparsestruct->mat->mat; 126881902715SJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 12694863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 127081902715SJunchao Zhang 127181902715SJunchao Zhang /* Update matT when it was built before */ 127281902715SJunchao Zhang if (cusparsestruct->matTranspose) { 127381902715SJunchao Zhang cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 127481902715SJunchao Zhang matT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 127581902715SJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 127681902715SJunchao Zhang A->cmap->n, mat->num_entries, 127781902715SJunchao Zhang mat->values->data().get(), 127881902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 127981902715SJunchao Zhang mat->column_indices->data().get(), 128081902715SJunchao Zhang matT->values->data().get(), 128181902715SJunchao Zhang matT->column_indices->data().get(), 128281902715SJunchao Zhang matT->row_offsets->data().get(), 12839f74ec24SSatish Balay CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat); 128481902715SJunchao Zhang } 128534d6c7a5SJose E. Roman } else { 12867c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 12877c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 12887c700b8dSJunchao Zhang delete cusparsestruct->workVector; 128981902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 12909ae82921SPaul Mullowney try { 12919ae82921SPaul Mullowney if (a->compressedrow.use) { 12929ae82921SPaul Mullowney m = a->compressedrow.nrows; 12939ae82921SPaul Mullowney ii = a->compressedrow.i; 12949ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 12959ae82921SPaul Mullowney } else { 1296213423ffSJunchao Zhang m = A->rmap->n; 1297213423ffSJunchao Zhang ii = a->i; 1298*e6e9a74fSStefano Zampini ridx = NULL; 12999ae82921SPaul Mullowney } 1300213423ffSJunchao Zhang cusparsestruct->nrows = m; 13019ae82921SPaul Mullowney 1302aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 1303aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 130457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 130557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 130657d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 13079ae82921SPaul Mullowney 1308c41cb2e2SAlejandro Lamas Daviña err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 13097656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 13107656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 13117656d835SStefano Zampini err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 13127656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 13137656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 131457d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1315b06137fdSPaul Mullowney 1316aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1317aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1318aa372e3fSPaul Mullowney /* set the matrix */ 1319aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1320a65300a6SPaul Mullowney matrix->num_rows = m; 1321aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1322aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1323a65300a6SPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1324a65300a6SPaul Mullowney matrix->row_offsets->assign(ii, ii + m+1); 13259ae82921SPaul Mullowney 1326aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1327aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1328aa372e3fSPaul Mullowney 1329aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1330aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1331aa372e3fSPaul Mullowney 1332aa372e3fSPaul Mullowney /* assign the pointer */ 1333aa372e3fSPaul Mullowney matstruct->mat = matrix; 1334aa372e3fSPaul Mullowney 1335aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1336aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1337a65300a6SPaul Mullowney matrix->num_rows = m; 1338aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1339aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1340a65300a6SPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1341a65300a6SPaul Mullowney matrix->row_offsets->assign(ii, ii + m+1); 1342aa372e3fSPaul Mullowney 1343aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1344aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1345aa372e3fSPaul Mullowney 1346aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1347aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1348aa372e3fSPaul Mullowney 1349aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 135057d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1351aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1352aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1353a65300a6SPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1354aa372e3fSPaul Mullowney matstruct->descr, matrix->values->data().get(), 1355aa372e3fSPaul Mullowney matrix->row_offsets->data().get(), 1356aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 135757d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1358aa372e3fSPaul Mullowney /* assign the pointer */ 1359aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1360aa372e3fSPaul Mullowney 1361aa372e3fSPaul Mullowney if (matrix) { 1362aa372e3fSPaul Mullowney if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1363aa372e3fSPaul Mullowney if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1364aa372e3fSPaul Mullowney if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1365aa372e3fSPaul Mullowney delete (CsrMatrix*)matrix; 1366087f3262SPaul Mullowney } 1367087f3262SPaul Mullowney } 1368ca45077fSPaul Mullowney 1369aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1370213423ffSJunchao Zhang if (a->compressedrow.use) { 1371213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1372aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1373aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1374213423ffSJunchao Zhang tmp = m; 1375213423ffSJunchao Zhang } else { 1376213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1377213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1378213423ffSJunchao Zhang tmp = 0; 1379213423ffSJunchao Zhang } 1380213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1381aa372e3fSPaul Mullowney 1382aa372e3fSPaul Mullowney /* assign the pointer */ 1383aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 13849ae82921SPaul Mullowney } catch(char *ex) { 13859ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 13869ae82921SPaul Mullowney } 138734d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 138834d6c7a5SJose E. Roman } 138957d48284SJunchao Zhang err = WaitForGPU();CHKERRCUDA(err); 1390c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 13919ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 13929ae82921SPaul Mullowney } 13939ae82921SPaul Mullowney PetscFunctionReturn(0); 13949ae82921SPaul Mullowney } 13959ae82921SPaul Mullowney 1396c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1397aa372e3fSPaul Mullowney { 1398aa372e3fSPaul Mullowney template <typename Tuple> 1399aa372e3fSPaul Mullowney __host__ __device__ 1400aa372e3fSPaul Mullowney void operator()(Tuple t) 1401aa372e3fSPaul Mullowney { 1402aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1403aa372e3fSPaul Mullowney } 1404aa372e3fSPaul Mullowney }; 1405aa372e3fSPaul Mullowney 1406*e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1407*e6e9a74fSStefano Zampini { 1408*e6e9a74fSStefano Zampini template <typename Tuple> 1409*e6e9a74fSStefano Zampini __host__ __device__ 1410*e6e9a74fSStefano Zampini void operator()(Tuple t) 1411*e6e9a74fSStefano Zampini { 1412*e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1413*e6e9a74fSStefano Zampini } 1414*e6e9a74fSStefano Zampini }; 1415*e6e9a74fSStefano Zampini 1416ccdfe979SStefano Zampini typedef struct { 1417ccdfe979SStefano Zampini PetscBool cisdense; 1418ccdfe979SStefano Zampini PetscScalar *Bt; 1419ccdfe979SStefano Zampini Mat X; 1420ccdfe979SStefano Zampini } MatMatCusparse; 1421ccdfe979SStefano Zampini 1422ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1423ccdfe979SStefano Zampini { 1424ccdfe979SStefano Zampini PetscErrorCode ierr; 1425ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1426ccdfe979SStefano Zampini cudaError_t cerr; 1427ccdfe979SStefano Zampini 1428ccdfe979SStefano Zampini PetscFunctionBegin; 1429ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1430ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1431ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1432ccdfe979SStefano Zampini PetscFunctionReturn(0); 1433ccdfe979SStefano Zampini } 1434ccdfe979SStefano Zampini 1435ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1436ccdfe979SStefano Zampini 1437ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1438ccdfe979SStefano Zampini { 1439ccdfe979SStefano Zampini Mat_Product *product = C->product; 1440ccdfe979SStefano Zampini Mat A,B; 1441ccdfe979SStefano Zampini PetscInt m,n,k,blda,clda; 1442ccdfe979SStefano Zampini PetscBool flg,biscuda; 1443ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1444ccdfe979SStefano Zampini cusparseStatus_t stat; 1445ccdfe979SStefano Zampini cusparseOperation_t opA; 1446ccdfe979SStefano Zampini cusparseMatDescr_t matA; 1447ccdfe979SStefano Zampini const PetscScalar *barray; 1448ccdfe979SStefano Zampini PetscScalar *carray; 1449ccdfe979SStefano Zampini PetscErrorCode ierr; 1450ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1451ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1452ccdfe979SStefano Zampini CsrMatrix *csrmat; 1453c8378d12SStefano Zampini cudaError_t cuer; 1454ccdfe979SStefano Zampini 1455ccdfe979SStefano Zampini PetscFunctionBegin; 1456ccdfe979SStefano Zampini MatCheckProduct(C,1); 1457ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1458ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1459ccdfe979SStefano Zampini A = product->A; 1460ccdfe979SStefano Zampini B = product->B; 1461ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1462ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1463ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1464ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1465ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1466ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1467ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1468ccdfe979SStefano Zampini switch (product->type) { 1469ccdfe979SStefano Zampini case MATPRODUCT_AB: 1470ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1471ccdfe979SStefano Zampini mat = cusp->mat; 1472ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1473ccdfe979SStefano Zampini m = A->rmap->n; 1474ccdfe979SStefano Zampini k = A->cmap->n; 1475ccdfe979SStefano Zampini n = B->cmap->n; 1476ccdfe979SStefano Zampini break; 1477ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1478*e6e9a74fSStefano Zampini if (!cusp->transgen) { 1479*e6e9a74fSStefano Zampini mat = cusp->mat; 1480*e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1481*e6e9a74fSStefano Zampini } else { 1482ccdfe979SStefano Zampini if (!cusp->matTranspose) { 1483ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1484ccdfe979SStefano Zampini } 1485ccdfe979SStefano Zampini mat = cusp->matTranspose; 1486ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1487*e6e9a74fSStefano Zampini } 1488ccdfe979SStefano Zampini m = A->cmap->n; 1489ccdfe979SStefano Zampini k = A->rmap->n; 1490ccdfe979SStefano Zampini n = B->cmap->n; 1491ccdfe979SStefano Zampini break; 1492ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1493ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1494ccdfe979SStefano Zampini mat = cusp->mat; 1495ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1496ccdfe979SStefano Zampini m = A->rmap->n; 1497ccdfe979SStefano Zampini k = B->cmap->n; 1498ccdfe979SStefano Zampini n = B->rmap->n; 1499ccdfe979SStefano Zampini break; 1500ccdfe979SStefano Zampini default: 1501ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1502ccdfe979SStefano Zampini } 1503ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1504ccdfe979SStefano Zampini matA = mat->descr; 1505ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 1506ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 1507ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1508ccdfe979SStefano Zampini if (!biscuda) { 1509ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1510ccdfe979SStefano Zampini } 1511ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1512ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1513c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1514c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1515c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1516c8378d12SStefano Zampini } else { 1517c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1518c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1519c8378d12SStefano Zampini } 1520c8378d12SStefano Zampini 1521c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1522c8378d12SStefano Zampini 1523ccdfe979SStefano Zampini /* cusparse spmm does not support transpose on B */ 1524ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1525ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 1526ccdfe979SStefano Zampini cublasStatus_t cerr; 1527ccdfe979SStefano Zampini 1528ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1529ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1530ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 1531ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 1532ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 1533ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1534ccdfe979SStefano Zampini blda = B->cmap->n; 1535ccdfe979SStefano Zampini } 1536ccdfe979SStefano Zampini 1537ccdfe979SStefano Zampini /* perform the MatMat operation */ 1538ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1539ccdfe979SStefano Zampini csrmat->num_entries,mat->alpha,matA, 1540ccdfe979SStefano Zampini csrmat->values->data().get(), 1541ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 1542ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 1543ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1544ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 1545c8378d12SStefano Zampini cuer = WaitForGPU();CHKERRCUDA(cuer); 1546c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1547c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1548ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1549ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 1550ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1551ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1552ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 1553ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1554ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1555ccdfe979SStefano Zampini } else { 1556ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1557ccdfe979SStefano Zampini } 1558ccdfe979SStefano Zampini if (mmdata->cisdense) { 1559ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1560ccdfe979SStefano Zampini } 1561ccdfe979SStefano Zampini if (!biscuda) { 1562ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1563ccdfe979SStefano Zampini } 1564ccdfe979SStefano Zampini PetscFunctionReturn(0); 1565ccdfe979SStefano Zampini } 1566ccdfe979SStefano Zampini 1567ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1568ccdfe979SStefano Zampini { 1569ccdfe979SStefano Zampini Mat_Product *product = C->product; 1570ccdfe979SStefano Zampini Mat A,B; 1571ccdfe979SStefano Zampini PetscInt m,n; 1572ccdfe979SStefano Zampini PetscBool cisdense,flg; 1573ccdfe979SStefano Zampini PetscErrorCode ierr; 1574ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1575ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1576ccdfe979SStefano Zampini 1577ccdfe979SStefano Zampini PetscFunctionBegin; 1578ccdfe979SStefano Zampini MatCheckProduct(C,1); 1579ccdfe979SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1580ccdfe979SStefano Zampini A = product->A; 1581ccdfe979SStefano Zampini B = product->B; 1582ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1583ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1584ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1585ccdfe979SStefano Zampini if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1586ccdfe979SStefano Zampini switch (product->type) { 1587ccdfe979SStefano Zampini case MATPRODUCT_AB: 1588ccdfe979SStefano Zampini m = A->rmap->n; 1589ccdfe979SStefano Zampini n = B->cmap->n; 1590ccdfe979SStefano Zampini break; 1591ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1592ccdfe979SStefano Zampini m = A->cmap->n; 1593ccdfe979SStefano Zampini n = B->cmap->n; 1594ccdfe979SStefano Zampini break; 1595ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1596ccdfe979SStefano Zampini m = A->rmap->n; 1597ccdfe979SStefano Zampini n = B->rmap->n; 1598ccdfe979SStefano Zampini break; 1599ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1600ccdfe979SStefano Zampini m = B->cmap->n; 1601ccdfe979SStefano Zampini n = B->cmap->n; 1602ccdfe979SStefano Zampini break; 1603ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1604ccdfe979SStefano Zampini m = B->rmap->n; 1605ccdfe979SStefano Zampini n = B->rmap->n; 1606ccdfe979SStefano Zampini break; 1607ccdfe979SStefano Zampini default: 1608ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1609ccdfe979SStefano Zampini } 1610ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 1611ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 1612ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 1613ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 1614ccdfe979SStefano Zampini 1615ccdfe979SStefano Zampini /* product data */ 1616ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 1617ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 1618ccdfe979SStefano Zampini /* cusparse spmm does not support transpose on B */ 1619ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1620ccdfe979SStefano Zampini cudaError_t cerr; 1621ccdfe979SStefano Zampini 1622ccdfe979SStefano Zampini cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 1623ccdfe979SStefano Zampini } 1624ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 1625ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1626ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 1627ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 1628ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 1629ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 1630ccdfe979SStefano Zampini } else { 1631ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 1632ccdfe979SStefano Zampini } 1633ccdfe979SStefano Zampini } 1634ccdfe979SStefano Zampini C->product->data = mmdata; 1635ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 1636ccdfe979SStefano Zampini 1637ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 1638ccdfe979SStefano Zampini PetscFunctionReturn(0); 1639ccdfe979SStefano Zampini } 1640ccdfe979SStefano Zampini 1641ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 1642ccdfe979SStefano Zampini 1643ccdfe979SStefano Zampini /* handles dense B */ 1644ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 1645ccdfe979SStefano Zampini { 1646ccdfe979SStefano Zampini Mat_Product *product = C->product; 1647ccdfe979SStefano Zampini PetscErrorCode ierr; 1648ccdfe979SStefano Zampini 1649ccdfe979SStefano Zampini PetscFunctionBegin; 1650ccdfe979SStefano Zampini MatCheckProduct(C,1); 1651ccdfe979SStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 1652ccdfe979SStefano Zampini if (product->A->boundtocpu) { 1653ccdfe979SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 1654ccdfe979SStefano Zampini PetscFunctionReturn(0); 1655ccdfe979SStefano Zampini } 1656ccdfe979SStefano Zampini switch (product->type) { 1657ccdfe979SStefano Zampini case MATPRODUCT_AB: 1658ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1659ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1660ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1661ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1662ccdfe979SStefano Zampini C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 1663ccdfe979SStefano Zampini default: 1664ccdfe979SStefano Zampini break; 1665ccdfe979SStefano Zampini } 1666ccdfe979SStefano Zampini PetscFunctionReturn(0); 1667ccdfe979SStefano Zampini } 1668ccdfe979SStefano Zampini 16696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 16709ae82921SPaul Mullowney { 1671b175d8bbSPaul Mullowney PetscErrorCode ierr; 16729ae82921SPaul Mullowney 16739ae82921SPaul Mullowney PetscFunctionBegin; 1674*e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1675*e6e9a74fSStefano Zampini PetscFunctionReturn(0); 1676*e6e9a74fSStefano Zampini } 1677*e6e9a74fSStefano Zampini 1678*e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 1679*e6e9a74fSStefano Zampini { 1680*e6e9a74fSStefano Zampini PetscErrorCode ierr; 1681*e6e9a74fSStefano Zampini 1682*e6e9a74fSStefano Zampini PetscFunctionBegin; 1683*e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1684*e6e9a74fSStefano Zampini PetscFunctionReturn(0); 1685*e6e9a74fSStefano Zampini } 1686*e6e9a74fSStefano Zampini 1687*e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1688*e6e9a74fSStefano Zampini { 1689*e6e9a74fSStefano Zampini PetscErrorCode ierr; 1690*e6e9a74fSStefano Zampini 1691*e6e9a74fSStefano Zampini PetscFunctionBegin; 1692*e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 1693*e6e9a74fSStefano Zampini PetscFunctionReturn(0); 1694*e6e9a74fSStefano Zampini } 1695*e6e9a74fSStefano Zampini 1696*e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1697*e6e9a74fSStefano Zampini { 1698*e6e9a74fSStefano Zampini PetscErrorCode ierr; 1699*e6e9a74fSStefano Zampini 1700*e6e9a74fSStefano Zampini PetscFunctionBegin; 1701*e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 17029ae82921SPaul Mullowney PetscFunctionReturn(0); 17039ae82921SPaul Mullowney } 17049ae82921SPaul Mullowney 17056fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1706ca45077fSPaul Mullowney { 1707b175d8bbSPaul Mullowney PetscErrorCode ierr; 1708ca45077fSPaul Mullowney 1709ca45077fSPaul Mullowney PetscFunctionBegin; 1710*e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1711ca45077fSPaul Mullowney PetscFunctionReturn(0); 1712ca45077fSPaul Mullowney } 1713ca45077fSPaul Mullowney 1714*e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 17159ae82921SPaul Mullowney { 17169ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1717aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 17189ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1719*e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 1720b175d8bbSPaul Mullowney PetscErrorCode ierr; 172157d48284SJunchao Zhang cudaError_t cerr; 1722aa372e3fSPaul Mullowney cusparseStatus_t stat; 1723*e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1724*e6e9a74fSStefano Zampini PetscBool compressed; 17256e111a19SKarl Rupp 17269ae82921SPaul Mullowney PetscFunctionBegin; 1727*e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 1728*e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 1729*e6e9a74fSStefano Zampini if (!yy) { 1730*e6e9a74fSStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1731*e6e9a74fSStefano Zampini } 1732*e6e9a74fSStefano Zampini PetscFunctionReturn(0); 1733*e6e9a74fSStefano Zampini } 173434d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 173534d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1736*e6e9a74fSStefano Zampini if (!trans) { 17379ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1738*e6e9a74fSStefano Zampini } else { 1739*e6e9a74fSStefano Zampini if (herm || !cusparsestruct->transgen) { 1740*e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 1741*e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1742*e6e9a74fSStefano Zampini } else { 1743*e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1744*e6e9a74fSStefano Zampini if (!matstruct) { 1745*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1746*e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1747*e6e9a74fSStefano Zampini } 1748*e6e9a74fSStefano Zampini } 1749*e6e9a74fSStefano Zampini } 1750*e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 1751*e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 1752213423ffSJunchao Zhang 1753*e6e9a74fSStefano Zampini try { 1754*e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 1755213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 1756213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 1757*e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 1758*e6e9a74fSStefano Zampini xptr = xarray; 1759*e6e9a74fSStefano Zampini dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */ 1760213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 1761*e6e9a74fSStefano Zampini } else { 1762*e6e9a74fSStefano Zampini xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */ 1763*e6e9a74fSStefano Zampini dptr = zarray; 1764*e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 1765*e6e9a74fSStefano Zampini 1766*e6e9a74fSStefano Zampini if (compressed) { 1767*e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 1768*e6e9a74fSStefano Zampini 1769*e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1770*e6e9a74fSStefano Zampini thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 1771*e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 1772*e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 1773*e6e9a74fSStefano Zampini cerr = WaitForGPU();CHKERRCUDA(cerr); 1774*e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1775*e6e9a74fSStefano Zampini } 1776*e6e9a74fSStefano Zampini } 17779ae82921SPaul Mullowney 17787a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1779213423ffSJunchao Zhang /* csr_spmv does y = alpha*Ax + beta*y */ 1780aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 17817656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1782*e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 1783a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 1784b06137fdSPaul Mullowney mat->num_entries, matstruct->alpha, matstruct->descr, 1785aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1786*e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 178757d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 1788aa372e3fSPaul Mullowney } else { 1789213423ffSJunchao Zhang if (cusparsestruct->nrows) { 1790301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1791*e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 1792b06137fdSPaul Mullowney matstruct->alpha, matstruct->descr, hybMat, 1793*e6e9a74fSStefano Zampini xptr, beta, 179457d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 1795a65300a6SPaul Mullowney } 1796aa372e3fSPaul Mullowney } 1797*e6e9a74fSStefano Zampini cerr = WaitForGPU();CHKERRCUDA(cerr); 1798958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1799aa372e3fSPaul Mullowney 1800*e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 1801213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 1802213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 1803213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 1804*e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 1805213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 18067656d835SStefano Zampini } 1807213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 1808c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 18097656d835SStefano Zampini } 18107656d835SStefano Zampini 1811213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 1812213423ffSJunchao Zhang if (compressed) { 1813213423ffSJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 1814*e6e9a74fSStefano Zampini 1815*e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1816c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1817*e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 1818c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 1819*e6e9a74fSStefano Zampini cerr = WaitForGPU();CHKERRCUDA(cerr); 1820958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1821*e6e9a74fSStefano Zampini } 1822*e6e9a74fSStefano Zampini } else { 1823*e6e9a74fSStefano Zampini if (yy && yy != zz) { 1824*e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 1825*e6e9a74fSStefano Zampini } 1826*e6e9a74fSStefano Zampini } 1827*e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 1828213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 1829213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 18309ae82921SPaul Mullowney } catch(char *ex) { 18319ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18329ae82921SPaul Mullowney } 1833*e6e9a74fSStefano Zampini if (yy) { 1834958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1835*e6e9a74fSStefano Zampini } else { 1836*e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 1837*e6e9a74fSStefano Zampini } 18389ae82921SPaul Mullowney PetscFunctionReturn(0); 18399ae82921SPaul Mullowney } 18409ae82921SPaul Mullowney 18416fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1842ca45077fSPaul Mullowney { 1843b175d8bbSPaul Mullowney PetscErrorCode ierr; 18446e111a19SKarl Rupp 1845ca45077fSPaul Mullowney PetscFunctionBegin; 1846*e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1847ca45077fSPaul Mullowney PetscFunctionReturn(0); 1848ca45077fSPaul Mullowney } 1849ca45077fSPaul Mullowney 18506fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 18519ae82921SPaul Mullowney { 18529ae82921SPaul Mullowney PetscErrorCode ierr; 18536e111a19SKarl Rupp 18549ae82921SPaul Mullowney PetscFunctionBegin; 18559ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 185695639643SRichard Tran Mills if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 1857bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 1858e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1859bc3f50f2SPaul Mullowney } 18609ae82921SPaul Mullowney PetscFunctionReturn(0); 18619ae82921SPaul Mullowney } 18629ae82921SPaul Mullowney 18639ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 1864e057df02SPaul Mullowney /*@ 18659ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1866e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 1867e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1868e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 1869e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 1870e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 18719ae82921SPaul Mullowney 1872d083f849SBarry Smith Collective 18739ae82921SPaul Mullowney 18749ae82921SPaul Mullowney Input Parameters: 18759ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 18769ae82921SPaul Mullowney . m - number of rows 18779ae82921SPaul Mullowney . n - number of columns 18789ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 18799ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 18800298fd71SBarry Smith (possibly different for each row) or NULL 18819ae82921SPaul Mullowney 18829ae82921SPaul Mullowney Output Parameter: 18839ae82921SPaul Mullowney . A - the matrix 18849ae82921SPaul Mullowney 18859ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 18869ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 18879ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 18889ae82921SPaul Mullowney 18899ae82921SPaul Mullowney Notes: 18909ae82921SPaul Mullowney If nnz is given then nz is ignored 18919ae82921SPaul Mullowney 18929ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 18939ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 18949ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 18959ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 18969ae82921SPaul Mullowney 18979ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 18980298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 18999ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 19009ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 19019ae82921SPaul Mullowney 19029ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 19039ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 19049ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 19059ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 19069ae82921SPaul Mullowney 19079ae82921SPaul Mullowney Level: intermediate 19089ae82921SPaul Mullowney 1909e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 19109ae82921SPaul Mullowney @*/ 19119ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 19129ae82921SPaul Mullowney { 19139ae82921SPaul Mullowney PetscErrorCode ierr; 19149ae82921SPaul Mullowney 19159ae82921SPaul Mullowney PetscFunctionBegin; 19169ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 19179ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 19189ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 19199ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 19209ae82921SPaul Mullowney PetscFunctionReturn(0); 19219ae82921SPaul Mullowney } 19229ae82921SPaul Mullowney 19236fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 19249ae82921SPaul Mullowney { 19259ae82921SPaul Mullowney PetscErrorCode ierr; 1926ab25e6cbSDominic Meiser 19279ae82921SPaul Mullowney PetscFunctionBegin; 19289ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 1929c70f7ee4SJunchao Zhang if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { 1930470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 19319ae82921SPaul Mullowney } 19329ae82921SPaul Mullowney } else { 1933470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1934aa372e3fSPaul Mullowney } 1935ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 1936ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 1937ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 1938ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 19399ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 19409ae82921SPaul Mullowney PetscFunctionReturn(0); 19419ae82921SPaul Mullowney } 19429ae82921SPaul Mullowney 1943ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 194495639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 19459ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 19469ff858a8SKarl Rupp { 19479ff858a8SKarl Rupp PetscErrorCode ierr; 19489ff858a8SKarl Rupp 19499ff858a8SKarl Rupp PetscFunctionBegin; 19509ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1951ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 19529ff858a8SKarl Rupp PetscFunctionReturn(0); 19539ff858a8SKarl Rupp } 19549ff858a8SKarl Rupp 195595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 195695639643SRichard Tran Mills { 1957*e6e9a74fSStefano Zampini PetscErrorCode ierr; 1958*e6e9a74fSStefano Zampini 195995639643SRichard Tran Mills PetscFunctionBegin; 1960*e6e9a74fSStefano Zampini if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 1961c34f1ff0SRichard Tran Mills /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU. 196295639643SRichard Tran Mills If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one. 1963*e6e9a74fSStefano Zampini Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case. 196495639643SRichard Tran Mills TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries; 196595639643SRichard Tran Mills can follow the example of MatBindToCPU_SeqAIJViennaCL(). */ 1966*e6e9a74fSStefano Zampini if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov"); 1967*e6e9a74fSStefano Zampini /* TODO: add support for this? */ 196895639643SRichard Tran Mills if (flg) { 196995639643SRichard Tran Mills A->ops->mult = MatMult_SeqAIJ; 197095639643SRichard Tran Mills A->ops->multadd = MatMultAdd_SeqAIJ; 1971c34f1ff0SRichard Tran Mills A->ops->multtranspose = MatMultTranspose_SeqAIJ; 1972c34f1ff0SRichard Tran Mills A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 1973*e6e9a74fSStefano Zampini A->ops->multhermitiantranspose = NULL; 1974*e6e9a74fSStefano Zampini A->ops->multhermitiantransposeadd = NULL; 197595639643SRichard Tran Mills A->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 197695639643SRichard Tran Mills A->ops->duplicate = MatDuplicate_SeqAIJ; 1977*e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 1978*e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 197995639643SRichard Tran Mills } else { 198095639643SRichard Tran Mills A->ops->mult = MatMult_SeqAIJCUSPARSE; 198195639643SRichard Tran Mills A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 198295639643SRichard Tran Mills A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 198395639643SRichard Tran Mills A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1984*e6e9a74fSStefano Zampini A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 1985*e6e9a74fSStefano Zampini A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 198695639643SRichard Tran Mills A->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 198795639643SRichard Tran Mills A->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1988*e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 1989*e6e9a74fSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 199095639643SRichard Tran Mills } 199195639643SRichard Tran Mills A->boundtocpu = flg; 199295639643SRichard Tran Mills PetscFunctionReturn(0); 199395639643SRichard Tran Mills } 199495639643SRichard Tran Mills 199549735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 19969ae82921SPaul Mullowney { 19979ae82921SPaul Mullowney PetscErrorCode ierr; 1998aa372e3fSPaul Mullowney cusparseStatus_t stat; 199949735bf3SStefano Zampini Mat B; 20009ae82921SPaul Mullowney 20019ae82921SPaul Mullowney PetscFunctionBegin; 200249735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 200349735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 200449735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 200549735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 200649735bf3SStefano Zampini } 200749735bf3SStefano Zampini B = *newmat; 200849735bf3SStefano Zampini 200934136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 201034136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 201134136279SStefano Zampini 201249735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 20139ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 2014*e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 2015*e6e9a74fSStefano Zampini 2016*e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2017*e6e9a74fSStefano Zampini spptr->format = MAT_CUSPARSE_CSR; 2018*e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2019*e6e9a74fSStefano Zampini B->spptr = spptr; 20209ae82921SPaul Mullowney } else { 2021*e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 2022*e6e9a74fSStefano Zampini 2023*e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 2024*e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2025*e6e9a74fSStefano Zampini B->spptr = spptr; 20269ae82921SPaul Mullowney } 2027*e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 202849735bf3SStefano Zampini } 20299ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 20309ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 203195639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 20322205254eSKarl Rupp 2033*e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 20349ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 20352205254eSKarl Rupp 2036bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 20379ae82921SPaul Mullowney PetscFunctionReturn(0); 20389ae82921SPaul Mullowney } 20399ae82921SPaul Mullowney 204002fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 204102fe1965SBarry Smith { 204202fe1965SBarry Smith PetscErrorCode ierr; 204302fe1965SBarry Smith 204402fe1965SBarry Smith PetscFunctionBegin; 204502fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 20460ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 204702fe1965SBarry Smith PetscFunctionReturn(0); 204802fe1965SBarry Smith } 204902fe1965SBarry Smith 20503ca39a21SBarry Smith /*MC 2051e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2052e057df02SPaul Mullowney 2053e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 20542692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 20552692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2056e057df02SPaul Mullowney 2057e057df02SPaul Mullowney Options Database Keys: 2058e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2059aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2060a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2061e057df02SPaul Mullowney 2062e057df02SPaul Mullowney Level: beginner 2063e057df02SPaul Mullowney 20648468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2065e057df02SPaul Mullowney M*/ 20667f756511SDominic Meiser 206742c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 206842c9c57cSBarry Smith 20690f39cd5aSBarry Smith 20703ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 207142c9c57cSBarry Smith { 207242c9c57cSBarry Smith PetscErrorCode ierr; 207342c9c57cSBarry Smith 207442c9c57cSBarry Smith PetscFunctionBegin; 20753ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 20763ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 20773ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 20783ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 207942c9c57cSBarry Smith PetscFunctionReturn(0); 208042c9c57cSBarry Smith } 208129b38603SBarry Smith 2082470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 20837f756511SDominic Meiser { 2084*e6e9a74fSStefano Zampini PetscErrorCode ierr; 20857f756511SDominic Meiser cusparseStatus_t stat; 20867f756511SDominic Meiser cusparseHandle_t handle; 20877f756511SDominic Meiser 20887f756511SDominic Meiser PetscFunctionBegin; 20897f756511SDominic Meiser if (*cusparsestruct) { 2090*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2091*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 20927f756511SDominic Meiser delete (*cusparsestruct)->workVector; 209381902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 20947f756511SDominic Meiser if (handle = (*cusparsestruct)->handle) { 209557d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 20967f756511SDominic Meiser } 2097*e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 20987f756511SDominic Meiser } 20997f756511SDominic Meiser PetscFunctionReturn(0); 21007f756511SDominic Meiser } 21017f756511SDominic Meiser 21027f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 21037f756511SDominic Meiser { 21047f756511SDominic Meiser PetscFunctionBegin; 21057f756511SDominic Meiser if (*mat) { 21067f756511SDominic Meiser delete (*mat)->values; 21077f756511SDominic Meiser delete (*mat)->column_indices; 21087f756511SDominic Meiser delete (*mat)->row_offsets; 21097f756511SDominic Meiser delete *mat; 21107f756511SDominic Meiser *mat = 0; 21117f756511SDominic Meiser } 21127f756511SDominic Meiser PetscFunctionReturn(0); 21137f756511SDominic Meiser } 21147f756511SDominic Meiser 2115470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 21167f756511SDominic Meiser { 21177f756511SDominic Meiser cusparseStatus_t stat; 21187f756511SDominic Meiser PetscErrorCode ierr; 21197f756511SDominic Meiser 21207f756511SDominic Meiser PetscFunctionBegin; 21217f756511SDominic Meiser if (*trifactor) { 212257d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 212357d48284SJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 21247f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 21257f756511SDominic Meiser delete *trifactor; 21267f756511SDominic Meiser *trifactor = 0; 21277f756511SDominic Meiser } 21287f756511SDominic Meiser PetscFunctionReturn(0); 21297f756511SDominic Meiser } 21307f756511SDominic Meiser 2131470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 21327f756511SDominic Meiser { 21337f756511SDominic Meiser CsrMatrix *mat; 21347f756511SDominic Meiser cusparseStatus_t stat; 21357f756511SDominic Meiser cudaError_t err; 21367f756511SDominic Meiser 21377f756511SDominic Meiser PetscFunctionBegin; 21387f756511SDominic Meiser if (*matstruct) { 21397f756511SDominic Meiser if ((*matstruct)->mat) { 21407f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 21417f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 214257d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 21437f756511SDominic Meiser } else { 21447f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 21457f756511SDominic Meiser CsrMatrix_Destroy(&mat); 21467f756511SDominic Meiser } 21477f756511SDominic Meiser } 214857d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 21497f756511SDominic Meiser delete (*matstruct)->cprowIndices; 2150c41cb2e2SAlejandro Lamas Daviña if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 21517656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 21527656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 21537f756511SDominic Meiser delete *matstruct; 21547f756511SDominic Meiser *matstruct = 0; 21557f756511SDominic Meiser } 21567f756511SDominic Meiser PetscFunctionReturn(0); 21577f756511SDominic Meiser } 21587f756511SDominic Meiser 2159ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 21607f756511SDominic Meiser { 2161*e6e9a74fSStefano Zampini PetscErrorCode ierr; 2162*e6e9a74fSStefano Zampini 21637f756511SDominic Meiser PetscFunctionBegin; 21647f756511SDominic Meiser if (*trifactors) { 2165*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2166*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2167*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2168*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 21697f756511SDominic Meiser delete (*trifactors)->rpermIndices; 21707f756511SDominic Meiser delete (*trifactors)->cpermIndices; 21717f756511SDominic Meiser delete (*trifactors)->workVector; 2172ccdfe979SStefano Zampini (*trifactors)->rpermIndices = 0; 2173ccdfe979SStefano Zampini (*trifactors)->cpermIndices = 0; 2174ccdfe979SStefano Zampini (*trifactors)->workVector = 0; 2175ccdfe979SStefano Zampini } 2176ccdfe979SStefano Zampini PetscFunctionReturn(0); 2177ccdfe979SStefano Zampini } 2178ccdfe979SStefano Zampini 2179ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2180ccdfe979SStefano Zampini { 2181*e6e9a74fSStefano Zampini PetscErrorCode ierr; 2182ccdfe979SStefano Zampini cusparseHandle_t handle; 2183ccdfe979SStefano Zampini cusparseStatus_t stat; 2184ccdfe979SStefano Zampini 2185ccdfe979SStefano Zampini PetscFunctionBegin; 2186ccdfe979SStefano Zampini if (*trifactors) { 2187*e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 21887f756511SDominic Meiser if (handle = (*trifactors)->handle) { 218957d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 21907f756511SDominic Meiser } 2191*e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 21927f756511SDominic Meiser } 21937f756511SDominic Meiser PetscFunctionReturn(0); 21947f756511SDominic Meiser } 2195