19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17*bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 18*bddcd29dSMark Adams #include <cooperative_groups.h> 19*bddcd29dSMark Adams #endif 20e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 21afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 22afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 23afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
24afb2bd1cSJunchao Zhang 25afb2bd1cSJunchao Zhang typedef enum { 26afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 27afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 28afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 29afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 30afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 31afb2bd1cSJunchao Zhang 32afb2bd1cSJunchao Zhang typedef enum { 33afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 34afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 35afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 37afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 45afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 46afb2bd1cSJunchao Zhang 47afb2bd1cSJunchao Zhang typedef enum { 48afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 49afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 50afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 51afb2bd1cSJunchao Zhang */ 52afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 53afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
54afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 55afb2bd1cSJunchao Zhang #endif 569ae82921SPaul Mullowney 57087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 58087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 59087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 60087f3262SPaul Mullowney 61*bddcd29dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*); 62*bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*); 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode 
MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 887f756511SDominic Meiser 8957181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 9057181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 91a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9257181aedSStefano Zampini 937e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 947e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 957e8381f9SStefano Zampini 96c215019aSStefano Zampini static 
PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 97c215019aSStefano Zampini 98b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 99b06137fdSPaul Mullowney { 100b06137fdSPaul Mullowney cusparseStatus_t stat; 101b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 102b06137fdSPaul Mullowney 103b06137fdSPaul Mullowney PetscFunctionBegin; 104d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 105b06137fdSPaul Mullowney cusparsestruct->stream = stream; 10657d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 107b06137fdSPaul Mullowney PetscFunctionReturn(0); 108b06137fdSPaul Mullowney } 109b06137fdSPaul Mullowney 110b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 111b06137fdSPaul Mullowney { 112b06137fdSPaul Mullowney cusparseStatus_t stat; 113b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 114b06137fdSPaul Mullowney 115b06137fdSPaul Mullowney PetscFunctionBegin; 116d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1176b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 11816a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11957d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 12016a2e217SAlejandro Lamas Daviña } 121b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1226b1cf21dSAlejandro Lamas Daviña } 12357d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 124b06137fdSPaul Mullowney PetscFunctionReturn(0); 125b06137fdSPaul Mullowney } 126b06137fdSPaul Mullowney 127b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 
128b06137fdSPaul Mullowney { 129b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1307e8381f9SStefano Zampini PetscBool flg; 1317e8381f9SStefano Zampini PetscErrorCode ierr; 132ccdfe979SStefano Zampini 133b06137fdSPaul Mullowney PetscFunctionBegin; 1347e8381f9SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1357e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 136ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 137b06137fdSPaul Mullowney PetscFunctionReturn(0); 138b06137fdSPaul Mullowney } 139b06137fdSPaul Mullowney 140ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1419ae82921SPaul Mullowney { 1429ae82921SPaul Mullowney PetscFunctionBegin; 1439ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1449ae82921SPaul Mullowney PetscFunctionReturn(0); 1459ae82921SPaul Mullowney } 1469ae82921SPaul Mullowney 147c708e6cdSJed Brown /*MC 148087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 149087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 150087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 151087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 152087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 153087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
154c708e6cdSJed Brown 1559ae82921SPaul Mullowney Level: beginner 156c708e6cdSJed Brown 1573ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 158c708e6cdSJed Brown M*/ 1599ae82921SPaul Mullowney 16042c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1619ae82921SPaul Mullowney { 1629ae82921SPaul Mullowney PetscErrorCode ierr; 163bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1649ae82921SPaul Mullowney 1659ae82921SPaul Mullowney PetscFunctionBegin; 166bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 167bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1682c7c0729SBarry Smith (*B)->factortype = ftype; 1692c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1709ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1712205254eSKarl Rupp 172087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 17333d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1749ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1759ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 176087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 177087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 178087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1799ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 180bc3f50f2SPaul Mullowney 181fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 
1823ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1839ae82921SPaul Mullowney PetscFunctionReturn(0); 1849ae82921SPaul Mullowney } 1859ae82921SPaul Mullowney 186bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 187ca45077fSPaul Mullowney { 188aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1896e111a19SKarl Rupp 190ca45077fSPaul Mullowney PetscFunctionBegin; 191ca45077fSPaul Mullowney switch (op) { 192e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 193aa372e3fSPaul Mullowney cusparsestruct->format = format; 194ca45077fSPaul Mullowney break; 195e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 196aa372e3fSPaul Mullowney cusparsestruct->format = format; 197ca45077fSPaul Mullowney break; 198ca45077fSPaul Mullowney default: 19936d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 200ca45077fSPaul Mullowney } 201ca45077fSPaul Mullowney PetscFunctionReturn(0); 202ca45077fSPaul Mullowney } 2039ae82921SPaul Mullowney 204e057df02SPaul Mullowney /*@ 205e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 206e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 207aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 208e057df02SPaul Mullowney Not Collective 209e057df02SPaul Mullowney 210e057df02SPaul Mullowney Input Parameters: 2118468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 21236d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. 
MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2132692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 214e057df02SPaul Mullowney 215e057df02SPaul Mullowney Output Parameter: 216e057df02SPaul Mullowney 217e057df02SPaul Mullowney Level: intermediate 218e057df02SPaul Mullowney 2198468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 220e057df02SPaul Mullowney @*/ 221e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 222e057df02SPaul Mullowney { 223e057df02SPaul Mullowney PetscErrorCode ierr; 2246e111a19SKarl Rupp 225e057df02SPaul Mullowney PetscFunctionBegin; 226e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 227e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 228e057df02SPaul Mullowney PetscFunctionReturn(0); 229e057df02SPaul Mullowney } 230e057df02SPaul Mullowney 2311a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 232e6e9a74fSStefano Zampini { 233e6e9a74fSStefano Zampini PetscErrorCode ierr; 234e6e9a74fSStefano Zampini 235e6e9a74fSStefano Zampini PetscFunctionBegin; 2361a2c6b5cSJunchao Zhang switch (op) { 2371a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 2381a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 2391a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 2401a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 2411a2c6b5cSJunchao Zhang break; 2421a2c6b5cSJunchao Zhang default: 2431a2c6b5cSJunchao Zhang ierr = 
MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 2441a2c6b5cSJunchao Zhang break; 245e6e9a74fSStefano Zampini } 246e6e9a74fSStefano Zampini PetscFunctionReturn(0); 247e6e9a74fSStefano Zampini } 248e6e9a74fSStefano Zampini 249*bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 250*bddcd29dSMark Adams 251*bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 252*bddcd29dSMark Adams { 253*bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 254*bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 255*bddcd29dSMark Adams PetscBool row_identity,col_identity; 256*bddcd29dSMark Adams PetscErrorCode ierr; 257*bddcd29dSMark Adams 258*bddcd29dSMark Adams PetscFunctionBegin; 259*bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 260*bddcd29dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 261*bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 262*bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. 
*/ 263*bddcd29dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 264*bddcd29dSMark Adams ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 265*bddcd29dSMark Adams if (row_identity && col_identity) { 266*bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 267*bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 268*bddcd29dSMark Adams B->ops->matsolve = NULL; 269*bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 270*bddcd29dSMark Adams } else { 271*bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 272*bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 273*bddcd29dSMark Adams B->ops->matsolve = NULL; 274*bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 275*bddcd29dSMark Adams } 276*bddcd29dSMark Adams 277*bddcd29dSMark Adams /* get the triangular factors */ 278*bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 279*bddcd29dSMark Adams PetscFunctionReturn(0); 280*bddcd29dSMark Adams } 281*bddcd29dSMark Adams 2824416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2839ae82921SPaul Mullowney { 2849ae82921SPaul Mullowney PetscErrorCode ierr; 285e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2869ae82921SPaul Mullowney PetscBool flg; 287a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2886e111a19SKarl Rupp 2899ae82921SPaul Mullowney PetscFunctionBegin; 290e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2919ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 292e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 293a183c035SDominic Meiser 
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 294afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 295afb2bd1cSJunchao Zhang 2964c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 297a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 298afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 299afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 300afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 301afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 302afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 303afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 304afb2bd1cSJunchao Zhang 305afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 306afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 307afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 
308afb2bd1cSJunchao Zhang 309afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 310afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 311afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 312afb2bd1cSJunchao Zhang #endif 3134c87dfd4SPaul Mullowney } 3140af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 3159ae82921SPaul Mullowney PetscFunctionReturn(0); 3169ae82921SPaul Mullowney } 3179ae82921SPaul Mullowney 3186fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3199ae82921SPaul Mullowney { 320da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3219ae82921SPaul Mullowney PetscErrorCode ierr; 3229ae82921SPaul Mullowney 3239ae82921SPaul Mullowney PetscFunctionBegin; 324da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3259ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3269ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3279ae82921SPaul Mullowney PetscFunctionReturn(0); 3289ae82921SPaul Mullowney } 3299ae82921SPaul Mullowney 3306fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3319ae82921SPaul Mullowney { 332da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3339ae82921SPaul Mullowney PetscErrorCode ierr; 3349ae82921SPaul Mullowney 3359ae82921SPaul Mullowney 
PetscFunctionBegin; 336da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3379ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3389ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3399ae82921SPaul Mullowney PetscFunctionReturn(0); 3409ae82921SPaul Mullowney } 3419ae82921SPaul Mullowney 342087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 343087f3262SPaul Mullowney { 344da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 345087f3262SPaul Mullowney PetscErrorCode ierr; 346087f3262SPaul Mullowney 347087f3262SPaul Mullowney PetscFunctionBegin; 348da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 349087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 350087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 351087f3262SPaul Mullowney PetscFunctionReturn(0); 352087f3262SPaul Mullowney } 353087f3262SPaul Mullowney 354087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 355087f3262SPaul Mullowney { 356da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 357087f3262SPaul Mullowney PetscErrorCode ierr; 358087f3262SPaul Mullowney 359087f3262SPaul Mullowney PetscFunctionBegin; 360da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 361087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 362087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 363087f3262SPaul Mullowney PetscFunctionReturn(0); 
364087f3262SPaul Mullowney } 365087f3262SPaul Mullowney 366087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3679ae82921SPaul Mullowney { 3689ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3699ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3709ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 371aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3729ae82921SPaul Mullowney cusparseStatus_t stat; 3739ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3749ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3759ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3769ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 377b175d8bbSPaul Mullowney PetscErrorCode ierr; 37857d48284SJunchao Zhang cudaError_t cerr; 3799ae82921SPaul Mullowney 3809ae82921SPaul Mullowney PetscFunctionBegin; 381cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 382c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3839ae82921SPaul Mullowney try { 3849ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 3859ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 386da79fbbcSStefano Zampini if (!loTriFactor) { 3872cbc15d9SMark PetscScalar *AALo; 3882cbc15d9SMark 3892cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3909ae82921SPaul Mullowney 3919ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 39257d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 39357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 3949ae82921SPaul Mullowney 3959ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3969ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3979ae82921SPaul Mullowney AiLo[n] = nzLower; 3989ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3999ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 4009ae82921SPaul Mullowney v = aa; 4019ae82921SPaul Mullowney vi = aj; 4029ae82921SPaul Mullowney offset = 1; 4039ae82921SPaul Mullowney rowOffset= 1; 4049ae82921SPaul Mullowney for (i=1; i<n; i++) { 4059ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 406e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 4079ae82921SPaul Mullowney AiLo[i] = rowOffset; 4089ae82921SPaul Mullowney rowOffset += nz+1; 4099ae82921SPaul Mullowney 410580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 411580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 4129ae82921SPaul Mullowney 4139ae82921SPaul Mullowney offset += nz; 4149ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 4159ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 4169ae82921SPaul Mullowney offset += 1; 4179ae82921SPaul Mullowney 4189ae82921SPaul Mullowney v += nz; 4199ae82921SPaul Mullowney vi += nz; 4209ae82921SPaul Mullowney } 4212205254eSKarl Rupp 422aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 423da79fbbcSStefano Zampini ierr = 
PetscNew(&loTriFactor);CHKERRQ(ierr); 424da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 425aa372e3fSPaul Mullowney /* Create the matrix description */ 42657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 42757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 429afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 430afb2bd1cSJunchao Zhang #else 43157d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 432afb2bd1cSJunchao Zhang #endif 43357d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 43457d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 435aa372e3fSPaul Mullowney 436aa372e3fSPaul Mullowney /* set the operation */ 437aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 438aa372e3fSPaul Mullowney 439aa372e3fSPaul Mullowney /* set the matrix */ 440aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 441aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 442aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 443aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 444aa372e3fSPaul Mullowney 445aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 447aa372e3fSPaul Mullowney 448aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 449aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 450aa372e3fSPaul Mullowney 451aa372e3fSPaul 
Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 452aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 453aa372e3fSPaul Mullowney 454afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 455da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 456afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4571b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 458afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 459afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 460afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 461afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 462afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 463afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 464afb2bd1cSJunchao Zhang #endif 465afb2bd1cSJunchao Zhang 466aa372e3fSPaul Mullowney /* perform the solve analysis */ 467aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 468aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 469aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 470afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 4711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 472afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 473afb2bd1cSJunchao Zhang #endif 474afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 475da79fbbcSStefano Zampini cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 476da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 477aa372e3fSPaul Mullowney 478da79fbbcSStefano Zampini /* assign the pointer */ 479aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4802cbc15d9SMark loTriFactor->AA_h = AALo; 48157d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 48257d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4834863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484da79fbbcSStefano Zampini } else { /* update values only */ 4852cbc15d9SMark if (!loTriFactor->AA_h) { 4862cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4872cbc15d9SMark } 488da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4892cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 490da79fbbcSStefano Zampini v = aa; 491da79fbbcSStefano Zampini vi = aj; 492da79fbbcSStefano Zampini offset = 1; 493da79fbbcSStefano Zampini for (i=1; i<n; i++) { 494da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 4952cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 496da79fbbcSStefano Zampini offset += nz; 4972cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 498da79fbbcSStefano Zampini offset += 1; 499da79fbbcSStefano Zampini v += nz; 500da79fbbcSStefano Zampini } 5012cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 502da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 503da79fbbcSStefano Zampini } 5049ae82921SPaul Mullowney } catch(char *ex) { 5059ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5069ae82921SPaul Mullowney } 5079ae82921SPaul Mullowney } 5089ae82921SPaul Mullowney PetscFunctionReturn(0); 5099ae82921SPaul Mullowney } 5109ae82921SPaul 
Mullowney 511087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 5129ae82921SPaul Mullowney { 5139ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5149ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5159ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 516aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 5179ae82921SPaul Mullowney cusparseStatus_t stat; 5189ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5199ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5209ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5219ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5229ae82921SPaul Mullowney PetscErrorCode ierr; 52357d48284SJunchao Zhang cudaError_t cerr; 5249ae82921SPaul Mullowney 5259ae82921SPaul Mullowney PetscFunctionBegin; 526cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 527c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5289ae82921SPaul Mullowney try { 5299ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5309ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 531da79fbbcSStefano Zampini if (!upTriFactor) { 5322cbc15d9SMark PetscScalar *AAUp; 5332cbc15d9SMark 5342cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5352cbc15d9SMark 5369ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 53757d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 53857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5399ae82921SPaul Mullowney 5409ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5419ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5429ae82921SPaul Mullowney AiUp[n]=nzUpper; 5439ae82921SPaul Mullowney offset = nzUpper; 5449ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5459ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5469ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5479ae82921SPaul Mullowney 548e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5499ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5509ae82921SPaul Mullowney 551e057df02SPaul Mullowney /* decrement the offset */ 5529ae82921SPaul Mullowney offset -= (nz+1); 5539ae82921SPaul Mullowney 554e057df02SPaul Mullowney /* first, set the diagonal elements */ 5559ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 55609f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5579ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5589ae82921SPaul Mullowney 559580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 560580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5619ae82921SPaul Mullowney } 5622205254eSKarl Rupp 563aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 564da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 565da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5662205254eSKarl Rupp 567aa372e3fSPaul Mullowney /* Create the matrix description */ 56857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 56957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 571afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 572afb2bd1cSJunchao Zhang #else 57357d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 574afb2bd1cSJunchao Zhang #endif 57557d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 57657d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 577aa372e3fSPaul Mullowney 578aa372e3fSPaul Mullowney /* set the operation */ 579aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 580aa372e3fSPaul Mullowney 581aa372e3fSPaul Mullowney /* set the matrix */ 582aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 583aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 584aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 585aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 586aa372e3fSPaul Mullowney 587aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 588aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 589aa372e3fSPaul Mullowney 590aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 591aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 592aa372e3fSPaul Mullowney 593aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
594aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 595aa372e3fSPaul Mullowney 596afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 597da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 598afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 600afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 601afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 604afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 605afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 606afb2bd1cSJunchao Zhang #endif 607afb2bd1cSJunchao Zhang 608aa372e3fSPaul Mullowney /* perform the solve analysis */ 609aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 6131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 614afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 615afb2bd1cSJunchao Zhang #endif 616afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 617da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 618da79fbbcSStefano Zampini ierr = 
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 619aa372e3fSPaul Mullowney 620da79fbbcSStefano Zampini /* assign the pointer */ 621aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6222cbc15d9SMark upTriFactor->AA_h = AAUp; 62357d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 62457d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6254863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 626da79fbbcSStefano Zampini } else { 6272cbc15d9SMark if (!upTriFactor->AA_h) { 6282cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6292cbc15d9SMark } 630da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 631da79fbbcSStefano Zampini offset = nzUpper; 632da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 633da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 634da79fbbcSStefano Zampini 635da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 636da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 637da79fbbcSStefano Zampini 638da79fbbcSStefano Zampini /* decrement the offset */ 639da79fbbcSStefano Zampini offset -= (nz+1); 640da79fbbcSStefano Zampini 641da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6422cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6432cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 644da79fbbcSStefano Zampini } 6452cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 646da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 647da79fbbcSStefano Zampini } 6489ae82921SPaul Mullowney } catch(char *ex) { 6499ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6509ae82921SPaul Mullowney } 6519ae82921SPaul Mullowney } 6529ae82921SPaul Mullowney 
PetscFunctionReturn(0); 6539ae82921SPaul Mullowney } 6549ae82921SPaul Mullowney 655087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6569ae82921SPaul Mullowney { 6579ae82921SPaul Mullowney PetscErrorCode ierr; 6589ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6599ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6609ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6619ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6629ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6639ae82921SPaul Mullowney 6649ae82921SPaul Mullowney PetscFunctionBegin; 665da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 666087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 667087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6682205254eSKarl Rupp 669da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 670aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6719ae82921SPaul Mullowney 672c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 673e057df02SPaul Mullowney /* lower triangular indices */ 6749ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 675da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 676da79fbbcSStefano Zampini const PetscInt *r; 677da79fbbcSStefano Zampini 678da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 679aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 680aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6819ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 682da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 
683da79fbbcSStefano Zampini } 6849ae82921SPaul Mullowney 685e057df02SPaul Mullowney /* upper triangular indices */ 6869ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 687da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 688da79fbbcSStefano Zampini const PetscInt *c; 689da79fbbcSStefano Zampini 690da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 691aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 692aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6939ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 694da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 695da79fbbcSStefano Zampini } 6969ae82921SPaul Mullowney PetscFunctionReturn(0); 6979ae82921SPaul Mullowney } 6989ae82921SPaul Mullowney 699087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 700087f3262SPaul Mullowney { 701087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 702087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 703aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 704aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 705087f3262SPaul Mullowney cusparseStatus_t stat; 706087f3262SPaul Mullowney PetscErrorCode ierr; 70757d48284SJunchao Zhang cudaError_t cerr; 708087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 709087f3262SPaul Mullowney PetscScalar *AAUp; 710087f3262SPaul Mullowney PetscScalar *AALo; 711087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 712087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 713087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 
714087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 715087f3262SPaul Mullowney 716087f3262SPaul Mullowney PetscFunctionBegin; 717cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 718c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 719087f3262SPaul Mullowney try { 720da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 721da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 722da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 723087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 72457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 72557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 726087f3262SPaul Mullowney 727087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 728087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 729087f3262SPaul Mullowney AiUp[n]=nzUpper; 730087f3262SPaul Mullowney offset = 0; 731087f3262SPaul Mullowney for (i=0; i<n; i++) { 732087f3262SPaul Mullowney /* set the pointers */ 733087f3262SPaul Mullowney v = aa + ai[i]; 734087f3262SPaul Mullowney vj = aj + ai[i]; 735087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 736087f3262SPaul Mullowney 737087f3262SPaul Mullowney /* first, set the diagonal elements */ 738087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 73909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 740087f3262SPaul Mullowney AiUp[i] = offset; 74109f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 742087f3262SPaul Mullowney 743087f3262SPaul Mullowney offset+=1; 744087f3262SPaul Mullowney if (nz>0) { 745f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 746580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, 
nz);CHKERRQ(ierr); 747087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 748087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 749087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 750087f3262SPaul Mullowney } 751087f3262SPaul Mullowney offset+=nz; 752087f3262SPaul Mullowney } 753087f3262SPaul Mullowney } 754087f3262SPaul Mullowney 755aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 756da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 757da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 758087f3262SPaul Mullowney 759aa372e3fSPaul Mullowney /* Create the matrix description */ 76057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 76157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 763afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 764afb2bd1cSJunchao Zhang #else 76557d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 766afb2bd1cSJunchao Zhang #endif 76757d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 76857d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 769087f3262SPaul Mullowney 770aa372e3fSPaul Mullowney /* set the matrix */ 771aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 772aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 773aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 774aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 775aa372e3fSPaul Mullowney 776aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
777aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 778aa372e3fSPaul Mullowney 779aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 780aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 781aa372e3fSPaul Mullowney 782aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 783aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 784aa372e3fSPaul Mullowney 785afb2bd1cSJunchao Zhang /* set the operation */ 786afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 787afb2bd1cSJunchao Zhang 788afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 789da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 790afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7911b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 792afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 793afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 794afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 795afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 796afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 797afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 798afb2bd1cSJunchao Zhang #endif 799afb2bd1cSJunchao Zhang 800aa372e3fSPaul Mullowney /* perform the solve analysis */ 801aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 802aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 
803aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 804afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 8051b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 806afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 807afb2bd1cSJunchao Zhang #endif 808afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 809da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 810da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 811aa372e3fSPaul Mullowney 812da79fbbcSStefano Zampini /* assign the pointer */ 813aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 814aa372e3fSPaul Mullowney 815aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 816da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 817da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 818aa372e3fSPaul Mullowney 819aa372e3fSPaul Mullowney /* Create the matrix description */ 82057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 82157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8221b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 824afb2bd1cSJunchao Zhang #else 82557d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 826afb2bd1cSJunchao Zhang #endif 82757d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 82857d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, 
CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 829aa372e3fSPaul Mullowney 830aa372e3fSPaul Mullowney /* set the operation */ 831aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 832aa372e3fSPaul Mullowney 833aa372e3fSPaul Mullowney /* set the matrix */ 834aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 835aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 836aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 837aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 838aa372e3fSPaul Mullowney 839aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 840aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 841aa372e3fSPaul Mullowney 842aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 843aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 846aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 847aa372e3fSPaul Mullowney 848afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 849da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 850afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 852afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 853afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 854afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 855afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 
856afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 857afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 858afb2bd1cSJunchao Zhang #endif 859afb2bd1cSJunchao Zhang 860aa372e3fSPaul Mullowney /* perform the solve analysis */ 861aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 862aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 863aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 864afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 8651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 866afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 867afb2bd1cSJunchao Zhang #endif 868afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 869da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 870da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 871aa372e3fSPaul Mullowney 872da79fbbcSStefano Zampini /* assign the pointer */ 873aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 874087f3262SPaul Mullowney 875da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 87657d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 87757d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 878da79fbbcSStefano Zampini } else { 879da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 880da79fbbcSStefano Zampini offset = 0; 881da79fbbcSStefano Zampini for (i=0; i<n; i++) { 882da79fbbcSStefano Zampini /* set the pointers */ 883da79fbbcSStefano Zampini v = aa + ai[i]; 884da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 
885da79fbbcSStefano Zampini 886da79fbbcSStefano Zampini /* first, set the diagonal elements */ 887da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 888da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 889da79fbbcSStefano Zampini 890da79fbbcSStefano Zampini offset+=1; 891da79fbbcSStefano Zampini if (nz>0) { 892da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 893da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 894da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 895da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 896da79fbbcSStefano Zampini } 897da79fbbcSStefano Zampini offset+=nz; 898da79fbbcSStefano Zampini } 899da79fbbcSStefano Zampini } 900da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 901da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 902da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 903da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 904da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 905da79fbbcSStefano Zampini } 90657d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 90757d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 908087f3262SPaul Mullowney } catch(char *ex) { 909087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 910087f3262SPaul Mullowney } 911087f3262SPaul Mullowney } 912087f3262SPaul Mullowney PetscFunctionReturn(0); 913087f3262SPaul Mullowney } 914087f3262SPaul Mullowney 915087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 9169ae82921SPaul Mullowney { 9179ae82921SPaul Mullowney PetscErrorCode ierr; 918087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 919087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 920087f3262SPaul Mullowney IS ip = a->row; 921087f3262SPaul Mullowney PetscBool perm_identity; 922087f3262SPaul Mullowney PetscInt n = A->rmap->n; 923087f3262SPaul Mullowney 924087f3262SPaul Mullowney PetscFunctionBegin; 925da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 926087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 927da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 928aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 929aa372e3fSPaul Mullowney 930da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 931da79fbbcSStefano Zampini 932087f3262SPaul Mullowney /* lower triangular indices */ 933087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 934087f3262SPaul Mullowney if (!perm_identity) { 9354e4bbfaaSStefano Zampini IS iip; 936da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9374e4bbfaaSStefano Zampini 9384e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9394e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 940da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 941aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 942aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 943aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9444e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9454e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9464e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 947087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 948da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 949da79fbbcSStefano Zampini } 
950087f3262SPaul Mullowney PetscFunctionReturn(0); 951087f3262SPaul Mullowney } 952087f3262SPaul Mullowney 953*bddcd29dSMark Adams #define CHECK_LAUNCH_ERROR() \ 954*bddcd29dSMark Adams do { \ 955*bddcd29dSMark Adams /* Check synchronous errors, i.e. pre-launch */ \ 956*bddcd29dSMark Adams cudaError_t err = cudaGetLastError(); \ 957*bddcd29dSMark Adams if (cudaSuccess != err) { \ 958*bddcd29dSMark Adams SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \ 959*bddcd29dSMark Adams } \ 960*bddcd29dSMark Adams /* Check asynchronous errors, i.e. kernel failed (ULF) */ \ 961*bddcd29dSMark Adams err = cudaDeviceSynchronize(); \ 962*bddcd29dSMark Adams if (cudaSuccess != err) { \ 963*bddcd29dSMark Adams SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \ 964*bddcd29dSMark Adams } \ 965*bddcd29dSMark Adams } while (0) 9669ae82921SPaul Mullowney 967087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 968087f3262SPaul Mullowney { 969087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 970087f3262SPaul Mullowney IS ip = b->row; 971087f3262SPaul Mullowney PetscBool perm_identity; 972b175d8bbSPaul Mullowney PetscErrorCode ierr; 973087f3262SPaul Mullowney 974087f3262SPaul Mullowney PetscFunctionBegin; 97557181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 976087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 977ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 978087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 979087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 980087f3262SPaul Mullowney if (perm_identity) { 981087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 982087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9834e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9844e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 985087f3262SPaul Mullowney } else { 986087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 987087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9884e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9894e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 990087f3262SPaul Mullowney } 991087f3262SPaul Mullowney 992087f3262SPaul Mullowney /* get the triangular factors */ 993087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 994087f3262SPaul Mullowney PetscFunctionReturn(0); 995087f3262SPaul Mullowney } 9969ae82921SPaul Mullowney 997b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 998bda325fcSPaul Mullowney { 999bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1000aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1001aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1002da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1003da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1004bda325fcSPaul Mullowney cusparseStatus_t stat; 1005aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1006aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1007aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1008aa372e3fSPaul 
Mullowney cusparseDiagType_t diagType; 10091b0a6780SStefano Zampini cudaError_t cerr; 1010da79fbbcSStefano Zampini PetscErrorCode ierr; 1011b175d8bbSPaul Mullowney 1012bda325fcSPaul Mullowney PetscFunctionBegin; 1013aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1014da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1015da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1016aa372e3fSPaul Mullowney 1017aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1018aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1019aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1020aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1021aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1022aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1023aa372e3fSPaul Mullowney 1024aa372e3fSPaul Mullowney /* Create the matrix description */ 102557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 102657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 102757d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 102857d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 102957d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1030aa372e3fSPaul Mullowney 1031aa372e3fSPaul Mullowney /* set the operation */ 1032aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1033aa372e3fSPaul Mullowney 1034aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1035aa372e3fSPaul Mullowney 
loTriFactorT->csrMat = new CsrMatrix; 1036afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1037afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1038aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1039afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1040afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1041afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1042aa372e3fSPaul Mullowney 1043aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1044afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1045afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1046afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1047afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1048afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1049afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1050afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1051afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1053afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10541b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1055afb2bd1cSJunchao Zhang #endif 1056afb2bd1cSJunchao Zhang 1057da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 
1058aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1059aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1060aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1061aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1062aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1063aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1064afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1065afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1066afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1067afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1068afb2bd1cSJunchao Zhang #else 1069afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1070afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1071afb2bd1cSJunchao Zhang #endif 1072afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1073da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1074da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1075aa372e3fSPaul Mullowney 1076afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1077da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1078afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1081afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1082afb2bd1cSJunchao Zhang 
loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1083afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1084afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1085afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1086afb2bd1cSJunchao Zhang #endif 1087afb2bd1cSJunchao Zhang 1088afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1089aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1090afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1091afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1092afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 10931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1094afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1095afb2bd1cSJunchao Zhang #endif 1096afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1097da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1098da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1099aa372e3fSPaul Mullowney 1100da79fbbcSStefano Zampini /* assign the pointer */ 1101aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1102aa372e3fSPaul Mullowney 1103aa372e3fSPaul Mullowney /*********************************************/ 1104aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1105aa372e3fSPaul Mullowney /*********************************************/ 1106aa372e3fSPaul Mullowney 1107aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1108da79fbbcSStefano Zampini ierr = 
PetscNew(&upTriFactorT);CHKERRQ(ierr); 1109da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1110aa372e3fSPaul Mullowney 1111aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1112aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1113aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1114aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1115aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1116aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1117aa372e3fSPaul Mullowney 1118aa372e3fSPaul Mullowney /* Create the matrix description */ 111957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 112057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 112157d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 112257d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 112357d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1124aa372e3fSPaul Mullowney 1125aa372e3fSPaul Mullowney /* set the operation */ 1126aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1127aa372e3fSPaul Mullowney 1128aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1129aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1130afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1131afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1132aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1133afb2bd1cSJunchao Zhang 
upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1134afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1135afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1136aa372e3fSPaul Mullowney 1137aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1138afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1139afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1140afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1141afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1142afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1143afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1144afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1145afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1146afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1147afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1148afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1149afb2bd1cSJunchao Zhang #endif 1150afb2bd1cSJunchao Zhang 1151da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1152aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1153aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1154aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1155aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 
1156aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1157aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1158afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1159afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1160afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1161afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1162afb2bd1cSJunchao Zhang #else 1163afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1164afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1165afb2bd1cSJunchao Zhang #endif 1166afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1167da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1168da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1169aa372e3fSPaul Mullowney 1170afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1171da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1172afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11731b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1174afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1175afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1176afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1177afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1178afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1179afb2bd1cSJunchao Zhang cerr = 
cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1180afb2bd1cSJunchao Zhang #endif 1181afb2bd1cSJunchao Zhang 1182afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1183aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1184afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1185afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1186afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 11871b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1188afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1189afb2bd1cSJunchao Zhang #endif 1190afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1191da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1192da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1193aa372e3fSPaul Mullowney 1194da79fbbcSStefano Zampini /* assign the pointer */ 1195aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1196bda325fcSPaul Mullowney PetscFunctionReturn(0); 1197bda325fcSPaul Mullowney } 1198bda325fcSPaul Mullowney 1199a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1200a49f1ed0SStefano Zampini { 1201a49f1ed0SStefano Zampini __host__ __device__ 1202a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1203a49f1ed0SStefano Zampini { 1204a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1205a49f1ed0SStefano Zampini } 1206a49f1ed0SStefano Zampini }; 1207a49f1ed0SStefano Zampini 12081a2c6b5cSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1209bda325fcSPaul Mullowney { 1210aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1211a49f1ed0SStefano 
Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1212bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1213bda325fcSPaul Mullowney cusparseStatus_t stat; 1214aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1215b06137fdSPaul Mullowney cudaError_t err; 121685ba7357SStefano Zampini PetscErrorCode ierr; 1217b175d8bbSPaul Mullowney 1218bda325fcSPaul Mullowney PetscFunctionBegin; 12191a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1220a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1221a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1222a49f1ed0SStefano Zampini if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct"); 1223a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 12241a2c6b5cSJunchao Zhang if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct"); 12251a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 122685ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1227a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1228a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1229a49f1ed0SStefano Zampini } 1230a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1231aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 123257d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1233aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 123457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 123557d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, 
CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1236aa372e3fSPaul Mullowney 1237b06137fdSPaul Mullowney /* set alpha and beta */ 1238afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12397656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12407656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1241afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12427656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12437656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244b06137fdSPaul Mullowney 1245aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1246aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1247a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1248554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1249554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1250aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1251a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1252aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1253aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1254a3fdcf43SKarl Rupp 1255039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 125681902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1257afb2bd1cSJunchao Zhang 1258afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 
1260afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1261afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1262afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1263afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1264afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1265afb2bd1cSJunchao Zhang #endif 1266aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1267afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1268afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1269afb2bd1cSJunchao Zhang #else 1270aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 127151c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 127251c6d536SStefano Zampini /* First convert HYB to CSR */ 1273aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1274aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1275aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1276aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1277aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1278aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1279aa372e3fSPaul Mullowney 1280aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1281aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1282aa372e3fSPaul Mullowney temp->values->data().get(), 1283aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 128457d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1285aa372e3fSPaul Mullowney 1286aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1287aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1288aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1289aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1290aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1292aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1293aa372e3fSPaul Mullowney 1294aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1295aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1296aa372e3fSPaul Mullowney temp->values->data().get(), 1297aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1298aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1299aa372e3fSPaul Mullowney tempT->values->data().get(), 1300aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1301aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 130257d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1303aa372e3fSPaul Mullowney 1304aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1305aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 130657d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1307aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1308aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1309aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1310aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1311aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1312aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 131357d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1314aa372e3fSPaul Mullowney 1315aa372e3fSPaul Mullowney /* assign the pointer */ 1316aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13171a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1318aa372e3fSPaul Mullowney /* delete temporaries */ 1319aa372e3fSPaul Mullowney if (tempT) { 1320aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1321aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1322aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1323aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1324087f3262SPaul Mullowney } 1325aa372e3fSPaul Mullowney if (temp) { 1326aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1327aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1328aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1329aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1330aa372e3fSPaul Mullowney } 1331afb2bd1cSJunchao Zhang #endif 1332aa372e3fSPaul Mullowney } 1333a49f1ed0SStefano Zampini } 1334a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1335a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1336a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1337a49f1ed0SStefano Zampini if (!matrix) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix"); 1338a49f1ed0SStefano Zampini if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows"); 1339a49f1ed0SStefano Zampini if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols"); 1340a49f1ed0SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values"); 1341a49f1ed0SStefano Zampini if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT"); 1342a49f1ed0SStefano Zampini if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows"); 1343a49f1ed0SStefano Zampini if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols"); 1344a49f1ed0SStefano Zampini if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values"); 1345a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1346a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1347a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1348a49f1ed0SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1349a49f1ed0SStefano Zampini } 1350a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1351a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1352a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1353a49f1ed0SStefano Zampini 1354a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1355a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1356a49f1ed0SStefano Zampini void *csr2cscBuffer; 1357a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1358a49f1ed0SStefano Zampini stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1359a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1360a49f1ed0SStefano Zampini matrix->values->data().get(), 1361a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1362a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1363a49f1ed0SStefano Zampini matrixT->values->data().get(), 1364a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1365a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 1366a49f1ed0SStefano Zampini cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1367a49f1ed0SStefano Zampini err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1368a49f1ed0SStefano Zampini #endif 1369a49f1ed0SStefano Zampini 13701a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13711a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13721a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13731a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13741a2c6b5cSJunchao Zhang 13751a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13761a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13771a2c6b5cSJunchao Zhang */ 13781a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 13791a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 13801a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 13811a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 13821a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1383a49f1ed0SStefano Zampini matrixT->values->data().get(), 1384a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1385a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1386a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13871a2c6b5cSJunchao Zhang cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1388a49f1ed0SStefano Zampini #else 1389a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 13901a2c6b5cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1391a49f1ed0SStefano Zampini #endif 13921a2c6b5cSJunchao Zhang } else { 13931a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13941a2c6b5cSJunchao Zhang } 13951a2c6b5cSJunchao Zhang 1396a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1397a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1398a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1399a49f1ed0SStefano Zampini err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1400a49f1ed0SStefano Zampini #endif 1401a49f1ed0SStefano Zampini } 1402a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1403a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), 1404a49f1ed0SStefano Zampini matrixT->values->begin())); 1405a49f1ed0SStefano Zampini } 140685ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1407213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1408213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1409aa372e3fSPaul Mullowney /* assign the pointer */ 1410aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 14111a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1412bda325fcSPaul Mullowney PetscFunctionReturn(0); 1413bda325fcSPaul Mullowney } 1414bda325fcSPaul Mullowney 1415a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 14166fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1417bda325fcSPaul Mullowney { 1418c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1419465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1420465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1421465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1422465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1423bda325fcSPaul Mullowney cusparseStatus_t stat; 1424bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1425aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1426aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1427aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1428b175d8bbSPaul Mullowney PetscErrorCode ierr; 142957d48284SJunchao Zhang cudaError_t 
cerr; 1430bda325fcSPaul Mullowney 1431bda325fcSPaul Mullowney PetscFunctionBegin; 1432aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1433aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1434bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1435aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1436aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1437bda325fcSPaul Mullowney } 1438bda325fcSPaul Mullowney 1439bda325fcSPaul Mullowney /* Get the GPU pointers */ 1440c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1441c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1442c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1443c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1444bda325fcSPaul Mullowney 14457a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1446aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1447a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1448c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1449c41cb2e2SAlejandro Lamas Daviña xGPU); 1450aa372e3fSPaul Mullowney 1451aa372e3fSPaul Mullowney /* First, solve U */ 1452aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1453afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1455afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1456afb2bd1cSJunchao Zhang #endif 1457afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, 
upTriFactorT->descr, 1458aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1459aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1460aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1461aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1462afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 14631b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1464afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1465afb2bd1cSJunchao Zhang #endif 1466afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1467aa372e3fSPaul Mullowney 1468aa372e3fSPaul Mullowney /* Then, solve L */ 1469aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1470afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1473afb2bd1cSJunchao Zhang #endif 1474afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1475aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1476aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1477aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1478aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1479afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1482afb2bd1cSJunchao Zhang #endif 1483afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1484aa372e3fSPaul Mullowney 1485aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1486a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1487c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1488aa372e3fSPaul Mullowney tempGPU->begin()); 1489aa372e3fSPaul Mullowney 1490aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1491a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1492bda325fcSPaul Mullowney 1493bda325fcSPaul Mullowney /* restore */ 1494c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1495c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 149605035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1497661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1498958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1499bda325fcSPaul Mullowney PetscFunctionReturn(0); 1500bda325fcSPaul Mullowney } 1501bda325fcSPaul Mullowney 15026fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1503bda325fcSPaul Mullowney { 1504465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1505465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1506bda325fcSPaul Mullowney cusparseStatus_t stat; 1507bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1508aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1509aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1510aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = 
(THRUSTARRAY*)cusparseTriFactors->workVector; 1511b175d8bbSPaul Mullowney PetscErrorCode ierr; 151257d48284SJunchao Zhang cudaError_t cerr; 1513bda325fcSPaul Mullowney 1514bda325fcSPaul Mullowney PetscFunctionBegin; 1515aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1516aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1517bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1518aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520bda325fcSPaul Mullowney } 1521bda325fcSPaul Mullowney 1522bda325fcSPaul Mullowney /* Get the GPU pointers */ 1523c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1524c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1525bda325fcSPaul Mullowney 15267a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1527aa372e3fSPaul Mullowney /* First, solve U */ 1528aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1529afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 15301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1531afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1532afb2bd1cSJunchao Zhang #endif 1533afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1534aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1535aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1536aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1537aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1538afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 15391b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1540afb2bd1cSJunchao Zhang 
,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1541afb2bd1cSJunchao Zhang #endif 1542afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1543aa372e3fSPaul Mullowney 1544aa372e3fSPaul Mullowney /* Then, solve L */ 1545aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1546afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 15471b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1548afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1549afb2bd1cSJunchao Zhang #endif 1550afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1551aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1552aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1553aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1554aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1555afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 15561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1557afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1558afb2bd1cSJunchao Zhang #endif 1559afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1560bda325fcSPaul Mullowney 1561bda325fcSPaul Mullowney /* restore */ 1562c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1563c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 156405035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1565661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1566958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1567bda325fcSPaul Mullowney PetscFunctionReturn(0); 1568bda325fcSPaul Mullowney } 1569bda325fcSPaul Mullowney 15706fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15719ae82921SPaul Mullowney { 1572465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1573465f34aeSAlejandro Lamas 
Daviña PetscScalar *xarray; 1574465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1575465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15769ae82921SPaul Mullowney cusparseStatus_t stat; 15779ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1578aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1579aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1580aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1581b175d8bbSPaul Mullowney PetscErrorCode ierr; 158257d48284SJunchao Zhang cudaError_t cerr; 15839ae82921SPaul Mullowney 15849ae82921SPaul Mullowney PetscFunctionBegin; 1585ebc8f436SDominic Meiser 1586e057df02SPaul Mullowney /* Get the GPU pointers */ 1587c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1588c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1589c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1590c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15919ae82921SPaul Mullowney 15927a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1593aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1594a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1595c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15964e4bbfaaSStefano Zampini tempGPU->begin()); 1597aa372e3fSPaul Mullowney 1598aa372e3fSPaul Mullowney /* Next, solve L */ 1599aa372e3fSPaul Mullowney stat = 
cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1600afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16011b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1602afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1603afb2bd1cSJunchao Zhang #endif 1604afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1605aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1606aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1607aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1608aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1609afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 16101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1612afb2bd1cSJunchao Zhang #endif 1613afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1614aa372e3fSPaul Mullowney 1615aa372e3fSPaul Mullowney /* Then, solve U */ 1616aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1617afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16181b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1619afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1620afb2bd1cSJunchao Zhang #endif 1621afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1622aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1623aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1624aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1625aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1626afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 16271b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1628afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1629afb2bd1cSJunchao Zhang #endif 1630afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1631aa372e3fSPaul Mullowney 16324e4bbfaaSStefano 
Zampini /* Last, reorder with the column permutation */ 1633a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 16344e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 16354e4bbfaaSStefano Zampini xGPU); 16369ae82921SPaul Mullowney 1637c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1638c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 163905035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1640661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1641958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16429ae82921SPaul Mullowney PetscFunctionReturn(0); 16439ae82921SPaul Mullowney } 16449ae82921SPaul Mullowney 16456fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 16469ae82921SPaul Mullowney { 1647465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1648465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16499ae82921SPaul Mullowney cusparseStatus_t stat; 16509ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1651aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1652aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1653aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1654b175d8bbSPaul Mullowney PetscErrorCode ierr; 165557d48284SJunchao Zhang cudaError_t cerr; 16569ae82921SPaul Mullowney 16579ae82921SPaul Mullowney PetscFunctionBegin; 1658e057df02SPaul Mullowney /* Get the GPU 
pointers */ 1659c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1660c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 16619ae82921SPaul Mullowney 16627a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1663aa372e3fSPaul Mullowney /* First, solve L */ 1664aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1665afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16661b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1667afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1668afb2bd1cSJunchao Zhang #endif 1669afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1670aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1671aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1672aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1673aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1674afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 16751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1676afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1677afb2bd1cSJunchao Zhang #endif 1678afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1679aa372e3fSPaul Mullowney 1680aa372e3fSPaul Mullowney /* Next, solve U */ 1681aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1682afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16831b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1684afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1685afb2bd1cSJunchao Zhang #endif 1686afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1687aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1688aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1689aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1690aa372e3fSPaul 
Mullowney upTriFactor->solveInfo, 1691afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 16921b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1693afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1694afb2bd1cSJunchao Zhang #endif 1695afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 16969ae82921SPaul Mullowney 1697c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1698c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 169905035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1700661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1701958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 17029ae82921SPaul Mullowney PetscFunctionReturn(0); 17039ae82921SPaul Mullowney } 17049ae82921SPaul Mullowney 17057e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17067e8381f9SStefano Zampini { 17077e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17087e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17097e8381f9SStefano Zampini cudaError_t cerr; 17107e8381f9SStefano Zampini PetscErrorCode ierr; 17117e8381f9SStefano Zampini 17127e8381f9SStefano Zampini PetscFunctionBegin; 17137e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17147e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17157e8381f9SStefano Zampini 17167e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17177e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 17187e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 17197e8381f9SStefano Zampini ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 17207e8381f9SStefano Zampini ierr = 
PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 17217e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17227e8381f9SStefano Zampini } 17237e8381f9SStefano Zampini PetscFunctionReturn(0); 17247e8381f9SStefano Zampini } 17257e8381f9SStefano Zampini 17267e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17277e8381f9SStefano Zampini { 17287e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17297e8381f9SStefano Zampini PetscErrorCode ierr; 17307e8381f9SStefano Zampini 17317e8381f9SStefano Zampini PetscFunctionBegin; 17327e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 17337e8381f9SStefano Zampini *array = a->a; 17347e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 17357e8381f9SStefano Zampini PetscFunctionReturn(0); 17367e8381f9SStefano Zampini } 17377e8381f9SStefano Zampini 17386fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 17399ae82921SPaul Mullowney { 1740aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 17417c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 17429ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1743213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 17449ae82921SPaul Mullowney PetscErrorCode ierr; 1745aa372e3fSPaul Mullowney cusparseStatus_t stat; 1746abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 1747b06137fdSPaul Mullowney cudaError_t err; 17489ae82921SPaul Mullowney 17499ae82921SPaul Mullowney PetscFunctionBegin; 1750fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1751c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1752a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values 
only */ 1753a49f1ed0SStefano Zampini CsrMatrix *matrix; 1754afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 175585ba7357SStefano Zampini 1756abb89eb1SStefano Zampini if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values"); 175785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1758afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 175905035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 17604863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 176185ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1762a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 176334d6c7a5SJose E. Roman } else { 1764abb89eb1SStefano Zampini PetscInt nnz; 176585ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17667c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1767a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 17687c700b8dSJunchao Zhang delete cusparsestruct->workVector; 176981902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1770a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1771a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 17729ae82921SPaul Mullowney try { 17739ae82921SPaul Mullowney if (a->compressedrow.use) { 17749ae82921SPaul Mullowney m = a->compressedrow.nrows; 17759ae82921SPaul Mullowney ii = a->compressedrow.i; 17769ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 17779ae82921SPaul Mullowney } else { 1778213423ffSJunchao Zhang m = A->rmap->n; 1779213423ffSJunchao Zhang ii = a->i; 1780e6e9a74fSStefano Zampini ridx = NULL; 17819ae82921SPaul Mullowney } 1782abb89eb1SStefano Zampini if (!ii) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data"); 1783abb89eb1SStefano Zampini if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data"); 1784abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1785abb89eb1SStefano Zampini else nnz = a->nz; 17869ae82921SPaul Mullowney 178785ba7357SStefano Zampini /* create cusparse matrix */ 1788abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1789aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 179057d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 179157d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 179257d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 17939ae82921SPaul Mullowney 1794afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 17957656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 17967656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1797afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17987656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17997656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 180057d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1801b06137fdSPaul Mullowney 1802aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1803aa372e3fSPaul Mullowney if 
(cusparsestruct->format==MAT_CUSPARSE_CSR) { 1804aa372e3fSPaul Mullowney /* set the matrix */ 1805afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1806afb2bd1cSJunchao Zhang mat->num_rows = m; 1807afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1808abb89eb1SStefano Zampini mat->num_entries = nnz; 1809afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1810afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18119ae82921SPaul Mullowney 1812abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1813abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1814aa372e3fSPaul Mullowney 1815abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1816abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1817aa372e3fSPaul Mullowney 1818aa372e3fSPaul Mullowney /* assign the pointer */ 1819afb2bd1cSJunchao Zhang matstruct->mat = mat; 1820afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1821afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1822afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1823afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1824afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1825afb2bd1cSJunchao Zhang mat->values->data().get(), 1826afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1827afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1828afb2bd1cSJunchao Zhang } 1829afb2bd1cSJunchao Zhang #endif 1830aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1831afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1832afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1833afb2bd1cSJunchao Zhang #else 1834afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1835afb2bd1cSJunchao Zhang mat->num_rows = m; 1836afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1837abb89eb1SStefano Zampini mat->num_entries = nnz; 1838afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1839afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1840aa372e3fSPaul Mullowney 1841abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1842abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1843aa372e3fSPaul Mullowney 1844abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1845abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1846aa372e3fSPaul Mullowney 1847aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 184857d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1849aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1850aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1851afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1852afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1853afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1854afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 185557d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1856aa372e3fSPaul Mullowney /* assign the pointer */ 1857aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1858aa372e3fSPaul Mullowney 1859afb2bd1cSJunchao Zhang if (mat) { 1860afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1861afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1862afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1863afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1864087f3262SPaul Mullowney } 1865afb2bd1cSJunchao Zhang #endif 1866087f3262SPaul Mullowney } 1867ca45077fSPaul Mullowney 1868aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1869213423ffSJunchao Zhang if (a->compressedrow.use) { 1870213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1871aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1872aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1873213423ffSJunchao Zhang tmp = m; 1874213423ffSJunchao Zhang } else { 1875213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1876213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1877213423ffSJunchao Zhang tmp = 0; 1878213423ffSJunchao Zhang } 1879213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1880aa372e3fSPaul Mullowney 1881aa372e3fSPaul Mullowney /* assign the pointer */ 1882aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 18839ae82921SPaul Mullowney } 
catch(char *ex) { 18849ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18859ae82921SPaul Mullowney } 188605035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 188785ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 188834d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 188934d6c7a5SJose E. Roman } 1890abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 18919ae82921SPaul Mullowney } 18929ae82921SPaul Mullowney PetscFunctionReturn(0); 18939ae82921SPaul Mullowney } 18949ae82921SPaul Mullowney 1895c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1896aa372e3fSPaul Mullowney { 1897aa372e3fSPaul Mullowney template <typename Tuple> 1898aa372e3fSPaul Mullowney __host__ __device__ 1899aa372e3fSPaul Mullowney void operator()(Tuple t) 1900aa372e3fSPaul Mullowney { 1901aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1902aa372e3fSPaul Mullowney } 1903aa372e3fSPaul Mullowney }; 1904aa372e3fSPaul Mullowney 19057e8381f9SStefano Zampini struct VecCUDAEquals 19067e8381f9SStefano Zampini { 19077e8381f9SStefano Zampini template <typename Tuple> 19087e8381f9SStefano Zampini __host__ __device__ 19097e8381f9SStefano Zampini void operator()(Tuple t) 19107e8381f9SStefano Zampini { 19117e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19127e8381f9SStefano Zampini } 19137e8381f9SStefano Zampini }; 19147e8381f9SStefano Zampini 1915e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1916e6e9a74fSStefano Zampini { 1917e6e9a74fSStefano Zampini template <typename Tuple> 1918e6e9a74fSStefano Zampini __host__ __device__ 1919e6e9a74fSStefano Zampini void operator()(Tuple t) 1920e6e9a74fSStefano Zampini { 1921e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1922e6e9a74fSStefano Zampini } 1923e6e9a74fSStefano Zampini }; 1924e6e9a74fSStefano Zampini 1925afb2bd1cSJunchao Zhang struct MatMatCusparse { 
1926ccdfe979SStefano Zampini PetscBool cisdense; 1927ccdfe979SStefano Zampini PetscScalar *Bt; 1928ccdfe979SStefano Zampini Mat X; 1929fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1930fcdce8c4SStefano Zampini PetscLogDouble flops; 1931fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 1932afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1933fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 1934afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1935afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1936afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1937afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1938fcdce8c4SStefano Zampini size_t mmBufferSize; 1939fcdce8c4SStefano Zampini void *mmBuffer; 1940fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1941fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 1942afb2bd1cSJunchao Zhang #endif 1943afb2bd1cSJunchao Zhang }; 1944ccdfe979SStefano Zampini 1945ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1946ccdfe979SStefano Zampini { 1947ccdfe979SStefano Zampini PetscErrorCode ierr; 1948ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1949ccdfe979SStefano Zampini cudaError_t cerr; 1950fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1951fcdce8c4SStefano Zampini cusparseStatus_t stat; 1952fcdce8c4SStefano Zampini #endif 1953ccdfe979SStefano Zampini 1954ccdfe979SStefano Zampini PetscFunctionBegin; 1955ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1956fcdce8c4SStefano Zampini delete mmdata->Bcsr; 1957afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1958fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 
1959fcdce8c4SStefano Zampini if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1960fcdce8c4SStefano Zampini if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1961afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1962afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1963fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1964afb2bd1cSJunchao Zhang #endif 1965ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1966ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1967ccdfe979SStefano Zampini PetscFunctionReturn(0); 1968ccdfe979SStefano Zampini } 1969ccdfe979SStefano Zampini 1970ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1971ccdfe979SStefano Zampini 1972ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1973ccdfe979SStefano Zampini { 1974ccdfe979SStefano Zampini Mat_Product *product = C->product; 1975ccdfe979SStefano Zampini Mat A,B; 1976afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1977ccdfe979SStefano Zampini PetscBool flg,biscuda; 1978ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1979ccdfe979SStefano Zampini cusparseStatus_t stat; 1980ccdfe979SStefano Zampini cusparseOperation_t opA; 1981ccdfe979SStefano Zampini const PetscScalar *barray; 1982ccdfe979SStefano Zampini PetscScalar *carray; 1983ccdfe979SStefano Zampini PetscErrorCode ierr; 1984ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1985ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1986ccdfe979SStefano Zampini CsrMatrix *csrmat; 1987afb2bd1cSJunchao Zhang cudaError_t cerr; 1988ccdfe979SStefano Zampini 1989ccdfe979SStefano Zampini 
PetscFunctionBegin; 1990ccdfe979SStefano Zampini MatCheckProduct(C,1); 1991ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1992ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1993ccdfe979SStefano Zampini A = product->A; 1994ccdfe979SStefano Zampini B = product->B; 1995ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1996ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1997ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1998ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1999ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2000ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2001ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2002ccdfe979SStefano Zampini switch (product->type) { 2003ccdfe979SStefano Zampini case MATPRODUCT_AB: 2004ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2005ccdfe979SStefano Zampini mat = cusp->mat; 2006ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2007ccdfe979SStefano Zampini m = A->rmap->n; 2008ccdfe979SStefano Zampini n = B->cmap->n; 2009ccdfe979SStefano Zampini break; 2010ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20111a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2012e6e9a74fSStefano Zampini mat = cusp->mat; 2013e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2014e6e9a74fSStefano Zampini } else { 20151a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2016ccdfe979SStefano Zampini mat = cusp->matTranspose; 
2017ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2018e6e9a74fSStefano Zampini } 2019ccdfe979SStefano Zampini m = A->cmap->n; 2020ccdfe979SStefano Zampini n = B->cmap->n; 2021ccdfe979SStefano Zampini break; 2022ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2023ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2024ccdfe979SStefano Zampini mat = cusp->mat; 2025ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2026ccdfe979SStefano Zampini m = A->rmap->n; 2027ccdfe979SStefano Zampini n = B->rmap->n; 2028ccdfe979SStefano Zampini break; 2029ccdfe979SStefano Zampini default: 2030ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2031ccdfe979SStefano Zampini } 2032ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2033ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2034ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 2035ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2036afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2037ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2038afb2bd1cSJunchao Zhang 2039ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2040c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2041c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2042c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2043c8378d12SStefano Zampini } else { 2044c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2045c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2046c8378d12SStefano 
Zampini } 2047c8378d12SStefano Zampini 2048c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2049afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2050afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2051fcdce8c4SStefano Zampini /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2052afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2053fcdce8c4SStefano Zampini size_t mmBufferSize; 2054afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2055afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2056afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2057afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2058afb2bd1cSJunchao Zhang } 2059c8378d12SStefano Zampini 2060afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2061afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2062afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2063afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2064afb2bd1cSJunchao Zhang } 2065afb2bd1cSJunchao Zhang 2066afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2067afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2068afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2069afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2070afb2bd1cSJunchao Zhang 
csrmat->values->data().get(), 2071afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2072afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2073afb2bd1cSJunchao Zhang } 2074afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2075afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2076afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2077fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2078fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2079fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2080fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2081fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2082fcdce8c4SStefano Zampini } 2083afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2084afb2bd1cSJunchao Zhang } else { 2085afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2086afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2087afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2088afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2089afb2bd1cSJunchao Zhang } 2090afb2bd1cSJunchao Zhang 2091afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2092afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2093afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2094afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2095fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2096afb2bd1cSJunchao Zhang #else 
2097afb2bd1cSJunchao Zhang PetscInt k; 2098afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2099ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2100ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2101ccdfe979SStefano Zampini cublasStatus_t cerr; 2102ccdfe979SStefano Zampini 2103ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2104ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2105ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2106ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2107ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2108ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2109ccdfe979SStefano Zampini blda = B->cmap->n; 2110afb2bd1cSJunchao Zhang k = B->cmap->n; 2111afb2bd1cSJunchao Zhang } else { 2112afb2bd1cSJunchao Zhang k = B->rmap->n; 2113ccdfe979SStefano Zampini } 2114ccdfe979SStefano Zampini 2115afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2116ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2117afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2118ccdfe979SStefano Zampini csrmat->values->data().get(), 2119ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2120ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2121ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2122ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2123afb2bd1cSJunchao Zhang #endif 2124afb2bd1cSJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2125c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2126c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2127ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2128ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2129ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2130ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2131ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2132ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2133ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2134ccdfe979SStefano Zampini } else { 2135ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2136ccdfe979SStefano Zampini } 2137ccdfe979SStefano Zampini if (mmdata->cisdense) { 2138ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2139ccdfe979SStefano Zampini } 2140ccdfe979SStefano Zampini if (!biscuda) { 2141ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2142ccdfe979SStefano Zampini } 2143ccdfe979SStefano Zampini PetscFunctionReturn(0); 2144ccdfe979SStefano Zampini } 2145ccdfe979SStefano Zampini 2146ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2147ccdfe979SStefano Zampini { 2148ccdfe979SStefano Zampini Mat_Product *product = C->product; 2149ccdfe979SStefano Zampini Mat A,B; 2150ccdfe979SStefano Zampini PetscInt 
m,n; 2151ccdfe979SStefano Zampini PetscBool cisdense,flg; 2152ccdfe979SStefano Zampini PetscErrorCode ierr; 2153ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2154ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2155ccdfe979SStefano Zampini 2156ccdfe979SStefano Zampini PetscFunctionBegin; 2157ccdfe979SStefano Zampini MatCheckProduct(C,1); 2158ccdfe979SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2159ccdfe979SStefano Zampini A = product->A; 2160ccdfe979SStefano Zampini B = product->B; 2161ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2162ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2163ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2164ccdfe979SStefano Zampini if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2165ccdfe979SStefano Zampini switch (product->type) { 2166ccdfe979SStefano Zampini case MATPRODUCT_AB: 2167ccdfe979SStefano Zampini m = A->rmap->n; 2168ccdfe979SStefano Zampini n = B->cmap->n; 2169ccdfe979SStefano Zampini break; 2170ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2171ccdfe979SStefano Zampini m = A->cmap->n; 2172ccdfe979SStefano Zampini n = B->cmap->n; 2173ccdfe979SStefano Zampini break; 2174ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2175ccdfe979SStefano Zampini m = A->rmap->n; 2176ccdfe979SStefano Zampini n = B->rmap->n; 2177ccdfe979SStefano Zampini break; 2178ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2179ccdfe979SStefano Zampini m = B->cmap->n; 2180ccdfe979SStefano Zampini n = B->cmap->n; 2181ccdfe979SStefano Zampini break; 2182ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2183ccdfe979SStefano Zampini m = B->rmap->n; 2184ccdfe979SStefano Zampini n = B->rmap->n; 2185ccdfe979SStefano Zampini break; 
2186ccdfe979SStefano Zampini default: 2187ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2188ccdfe979SStefano Zampini } 2189ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2190ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2191ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2192ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2193ccdfe979SStefano Zampini 2194ccdfe979SStefano Zampini /* product data */ 2195ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2196ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2197afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2198afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2199ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2200afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2201ccdfe979SStefano Zampini } 2202afb2bd1cSJunchao Zhang #endif 2203ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2204ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2205ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2206ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2207ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2208ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 
2209ccdfe979SStefano Zampini } else { 2210ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2211ccdfe979SStefano Zampini } 2212ccdfe979SStefano Zampini } 2213ccdfe979SStefano Zampini C->product->data = mmdata; 2214ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2215ccdfe979SStefano Zampini 2216ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2217ccdfe979SStefano Zampini PetscFunctionReturn(0); 2218ccdfe979SStefano Zampini } 2219ccdfe979SStefano Zampini 2220fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2221ccdfe979SStefano Zampini { 2222ccdfe979SStefano Zampini Mat_Product *product = C->product; 2223fcdce8c4SStefano Zampini Mat A,B; 2224fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2225fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2226fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2227fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2228fcdce8c4SStefano Zampini PetscBool flg; 2229ccdfe979SStefano Zampini PetscErrorCode ierr; 2230fcdce8c4SStefano Zampini cusparseStatus_t stat; 2231fcdce8c4SStefano Zampini cudaError_t cerr; 2232fcdce8c4SStefano Zampini MatProductType ptype; 2233fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2234fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2235fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2236fcdce8c4SStefano Zampini #endif 2237ccdfe979SStefano Zampini 2238ccdfe979SStefano Zampini PetscFunctionBegin; 2239ccdfe979SStefano Zampini MatCheckProduct(C,1); 2240fcdce8c4SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2241fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2242fcdce8c4SStefano Zampini if (!flg) 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2243fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2244fcdce8c4SStefano Zampini A = product->A; 2245fcdce8c4SStefano Zampini B = product->B; 2246fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2247fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2248fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2249fcdce8c4SStefano Zampini if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2250fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2251fcdce8c4SStefano Zampini if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2252fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2253fcdce8c4SStefano Zampini if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2254fcdce8c4SStefano Zampini goto finalize; 2255fcdce8c4SStefano Zampini } 2256fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2257fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2258fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2259fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2260fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2261fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric 
phases"); 2262fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2263fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2264fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2265fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2266fcdce8c4SStefano Zampini if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2267fcdce8c4SStefano Zampini if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2268fcdce8c4SStefano Zampini if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2269fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2270fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2271fcdce8c4SStefano Zampini 2272fcdce8c4SStefano Zampini ptype = product->type; 2273fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2274fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2275fcdce8c4SStefano Zampini switch (ptype) { 2276fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2277fcdce8c4SStefano Zampini Amat = Acusp->mat; 2278fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2279fcdce8c4SStefano Zampini break; 2280fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2281fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2282fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2283fcdce8c4SStefano Zampini break; 2284fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2285fcdce8c4SStefano Zampini Amat = Acusp->mat; 2286fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2287fcdce8c4SStefano Zampini break; 2288fcdce8c4SStefano Zampini default: 
2289fcdce8c4SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2290fcdce8c4SStefano Zampini } 2291fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2292fcdce8c4SStefano Zampini if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2293fcdce8c4SStefano Zampini if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2294fcdce8c4SStefano Zampini if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2295fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2296fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2297fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2298fcdce8c4SStefano Zampini if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2299fcdce8c4SStefano Zampini if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2300fcdce8c4SStefano Zampini if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2301fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2302fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2303fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2304fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2305fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2306fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2307fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2308fcdce8c4SStefano Zampini stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2309fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2310fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2311fcdce8c4SStefano Zampini #else 2312fcdce8c4SStefano Zampini stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2313fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2314fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2315fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2316fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2317fcdce8c4SStefano Zampini #endif 2318fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2319fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2320fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2321fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2322fcdce8c4SStefano Zampini finalize: 2323fcdce8c4SStefano Zampini 
/* shorter version of MatAssemblyEnd_SeqAIJ */ 2324fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2325fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2326fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2327fcdce8c4SStefano Zampini c->reallocs = 0; 2328fcdce8c4SStefano Zampini C->info.mallocs += 0; 2329fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2330fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2331fcdce8c4SStefano Zampini C->num_ass++; 2332ccdfe979SStefano Zampini PetscFunctionReturn(0); 2333ccdfe979SStefano Zampini } 2334fcdce8c4SStefano Zampini 2335fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2336fcdce8c4SStefano Zampini { 2337fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2338fcdce8c4SStefano Zampini Mat A,B; 2339fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2340fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2341fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2342fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2343fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2344fcdce8c4SStefano Zampini PetscBool flg; 2345fcdce8c4SStefano Zampini PetscErrorCode ierr; 2346fcdce8c4SStefano Zampini cusparseStatus_t stat; 2347fcdce8c4SStefano Zampini cudaError_t cerr; 2348fcdce8c4SStefano Zampini MatProductType ptype; 2349fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2350fcdce8c4SStefano Zampini PetscLogDouble flops; 2351fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2352fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2353fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2354fcdce8c4SStefano Zampini size_t bufSize2; 2355fcdce8c4SStefano Zampini cusparseSpMatDescr_t 
BmatSpDescr; 2356fcdce8c4SStefano Zampini #else 2357fcdce8c4SStefano Zampini int cnz; 2358fcdce8c4SStefano Zampini #endif 2359fcdce8c4SStefano Zampini 2360fcdce8c4SStefano Zampini PetscFunctionBegin; 2361fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2362fcdce8c4SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2363fcdce8c4SStefano Zampini A = product->A; 2364fcdce8c4SStefano Zampini B = product->B; 2365fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2366fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2367fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2368fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2369fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2370fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2371fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2372fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2373fcdce8c4SStefano Zampini if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2374fcdce8c4SStefano Zampini if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2375fcdce8c4SStefano Zampini 2376fcdce8c4SStefano Zampini /* product data */ 2377fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2378fcdce8c4SStefano Zampini C->product->data = mmdata; 2379fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2380fcdce8c4SStefano Zampini 2381fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2382fcdce8c4SStefano Zampini ierr = 
MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2383fcdce8c4SStefano Zampini ptype = product->type; 2384fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2385fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2386fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2387fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2388fcdce8c4SStefano Zampini switch (ptype) { 2389fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2390fcdce8c4SStefano Zampini m = A->rmap->n; 2391fcdce8c4SStefano Zampini n = B->cmap->n; 2392fcdce8c4SStefano Zampini k = A->cmap->n; 2393fcdce8c4SStefano Zampini Amat = Acusp->mat; 2394fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2395fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2396fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2397fcdce8c4SStefano Zampini break; 2398fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2399fcdce8c4SStefano Zampini m = A->cmap->n; 2400fcdce8c4SStefano Zampini n = B->cmap->n; 2401fcdce8c4SStefano Zampini k = A->rmap->n; 24021a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2403fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2404fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2405fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2406fcdce8c4SStefano Zampini break; 2407fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2408fcdce8c4SStefano Zampini m = A->rmap->n; 2409fcdce8c4SStefano Zampini n = B->rmap->n; 2410fcdce8c4SStefano Zampini k = A->cmap->n; 24111a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2412fcdce8c4SStefano Zampini Amat = Acusp->mat; 2413fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2414fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2415fcdce8c4SStefano Zampini break; 2416fcdce8c4SStefano Zampini default: 
2417fcdce8c4SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2418fcdce8c4SStefano Zampini } 2419fcdce8c4SStefano Zampini 2420fcdce8c4SStefano Zampini /* create cusparse matrix */ 2421fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2422fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2423fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2424fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2425fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2426fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2427fcdce8c4SStefano Zampini 2428fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2429fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2430fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2431fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2432fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2433fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2434fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2435fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2436fcdce8c4SStefano Zampini } else { 2437fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2438fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2439fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2440fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2441fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2442fcdce8c4SStefano Zampini } 2443fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2444fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2445fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2446fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2447fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2448fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2449fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2450fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2451fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2452fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2453fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2454fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2455fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2456fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2457fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2458fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2459fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2460fcdce8c4SStefano Zampini c->nz = 0; 2461fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2462fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2463fcdce8c4SStefano Zampini goto finalizesym; 2464fcdce8c4SStefano Zampini } 2465fcdce8c4SStefano Zampini 2466fcdce8c4SStefano Zampini if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2467fcdce8c4SStefano Zampini if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2468fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2469fcdce8c4SStefano Zampini if (!biscompressed) { 2470fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2471fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2472fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2473fcdce8c4SStefano Zampini #endif 2474fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2475fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2476fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2477fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2478fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2479fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2480fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2481fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2482fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2483fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2484fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2485fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2486fcdce8c4SStefano Zampini } 
2487fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2488fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2489fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2490fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2491fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2492fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2493fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2494fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2495fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2496fcdce8c4SStefano Zampini } 2497fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2498fcdce8c4SStefano Zampini #endif 2499fcdce8c4SStefano Zampini } 2500fcdce8c4SStefano Zampini if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2501fcdce8c4SStefano Zampini if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2502fcdce8c4SStefano Zampini /* precompute flops count */ 2503fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2504fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2505fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2506fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2507fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2508fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2509fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2510fcdce8c4SStefano Zampini } 2511fcdce8c4SStefano Zampini } 2512fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2513fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2514fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2515fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2516fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 
2517fcdce8c4SStefano Zampini } 2518fcdce8c4SStefano Zampini } else { /* TODO */ 2519fcdce8c4SStefano Zampini flops = 0.; 2520fcdce8c4SStefano Zampini } 2521fcdce8c4SStefano Zampini 2522fcdce8c4SStefano Zampini mmdata->flops = flops; 2523fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2524fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2525fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2526fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2527fcdce8c4SStefano Zampini NULL, NULL, NULL, 2528fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2529fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2530fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2531fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2532fcdce8c4SStefano Zampini stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2533fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2534fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2535fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2536bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2537fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2538fcdce8c4SStefano Zampini stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2539fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2540fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2541fcdce8c4SStefano 
Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2542fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2543fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2544fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2545fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2546fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2547fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2548fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2549fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2550fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2551fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2552bfcc3627SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2553fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2554fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2555fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2556fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2557fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2558fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2559fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2560fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 256100702c57SStefano Zampini ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2562fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2563fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2564fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2565fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2566fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2567fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2568fcdce8c4SStefano Zampini stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2569fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2570fcdce8c4SStefano Zampini cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2571fcdce8c4SStefano Zampini #else 2572fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2573fcdce8c4SStefano Zampini stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2574fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2575fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2576fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2577fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2578fcdce8c4SStefano Zampini c->nz = cnz; 2579fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2580fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2581fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2582fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2583fcdce8c4SStefano Zampini 2584fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2585fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2586fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2587fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2588fcdce8c4SStefano Zampini stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2589fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2590fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2591fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2592fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2593fcdce8c4SStefano Zampini #endif 2594fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2595fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2596fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2597fcdce8c4SStefano Zampini finalizesym: 2598fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2599fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2600fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2601fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2602fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2603fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2604fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2605fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2606fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2607fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2608fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2609fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2610fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 
2611fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2612fcdce8c4SStefano Zampini } else { 2613fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2614fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2615fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2616fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2617fcdce8c4SStefano Zampini } 2618fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2619fcdce8c4SStefano Zampini PetscInt r = 0; 2620fcdce8c4SStefano Zampini c->i[0] = 0; 2621fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2622fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2623fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2624fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2625fcdce8c4SStefano Zampini } 2626fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2627fcdce8c4SStefano Zampini } 2628fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2629fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2630fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2631fcdce8c4SStefano Zampini c->maxnz = c->nz; 2632fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2633fcdce8c4SStefano Zampini c->rmax = 0; 2634fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2635fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2636fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2637fcdce8c4SStefano Zampini c->nonzerorowcnt += 
(PetscInt)!!nn; 2638fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2639fcdce8c4SStefano Zampini } 2640fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2641fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2642fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2643fcdce8c4SStefano Zampini 2644fcdce8c4SStefano Zampini C->nonzerostate++; 2645fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2646fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2647fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2648fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2649fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2650fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2651fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2652abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2653fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2654fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2655fcdce8c4SStefano Zampini } 2656fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2657fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2658fcdce8c4SStefano Zampini } 2659fcdce8c4SStefano Zampini 2660fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2661fcdce8c4SStefano Zampini 2662fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2663fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2664fcdce8c4SStefano Zampini { 2665fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2666fcdce8c4SStefano Zampini PetscErrorCode ierr; 2667fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = 
PETSC_FALSE,Ciscusp = PETSC_TRUE; 2668fcdce8c4SStefano Zampini 2669fcdce8c4SStefano Zampini PetscFunctionBegin; 2670fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2671fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2672abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 2673fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2674fcdce8c4SStefano Zampini } 2675fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2676fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2677fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2678fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2679fcdce8c4SStefano Zampini } 2680fcdce8c4SStefano Zampini } 2681fcdce8c4SStefano Zampini if (isdense) { 2682ccdfe979SStefano Zampini switch (product->type) { 2683ccdfe979SStefano Zampini case MATPRODUCT_AB: 2684ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2685ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2686ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2687ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2688fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2689fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2690fcdce8c4SStefano Zampini } else { 2691fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2692fcdce8c4SStefano Zampini } 2693fcdce8c4SStefano Zampini break; 2694fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2695fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2696fcdce8c4SStefano Zampini break; 2697ccdfe979SStefano Zampini default: 2698ccdfe979SStefano Zampini break; 2699ccdfe979SStefano Zampini } 2700fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2701fcdce8c4SStefano Zampini switch 
(product->type) { 2702fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2703fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2704fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2705fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2706fcdce8c4SStefano Zampini break; 2707fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2708fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2709fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2710fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2711fcdce8c4SStefano Zampini break; 2712fcdce8c4SStefano Zampini default: 2713fcdce8c4SStefano Zampini break; 2714fcdce8c4SStefano Zampini } 2715fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 2716fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2717fcdce8c4SStefano Zampini } 2718ccdfe979SStefano Zampini PetscFunctionReturn(0); 2719ccdfe979SStefano Zampini } 2720ccdfe979SStefano Zampini 27216fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 27229ae82921SPaul Mullowney { 2723b175d8bbSPaul Mullowney PetscErrorCode ierr; 27249ae82921SPaul Mullowney 27259ae82921SPaul Mullowney PetscFunctionBegin; 2726e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2727e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2728e6e9a74fSStefano Zampini } 2729e6e9a74fSStefano Zampini 2730e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2731e6e9a74fSStefano Zampini { 2732e6e9a74fSStefano Zampini PetscErrorCode ierr; 2733e6e9a74fSStefano Zampini 2734e6e9a74fSStefano Zampini PetscFunctionBegin; 2735e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2736e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2737e6e9a74fSStefano Zampini } 2738e6e9a74fSStefano Zampini 2739e6e9a74fSStefano Zampini static 
PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2740e6e9a74fSStefano Zampini { 2741e6e9a74fSStefano Zampini PetscErrorCode ierr; 2742e6e9a74fSStefano Zampini 2743e6e9a74fSStefano Zampini PetscFunctionBegin; 2744e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2745e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2746e6e9a74fSStefano Zampini } 2747e6e9a74fSStefano Zampini 2748e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2749e6e9a74fSStefano Zampini { 2750e6e9a74fSStefano Zampini PetscErrorCode ierr; 2751e6e9a74fSStefano Zampini 2752e6e9a74fSStefano Zampini PetscFunctionBegin; 2753e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 27549ae82921SPaul Mullowney PetscFunctionReturn(0); 27559ae82921SPaul Mullowney } 27569ae82921SPaul Mullowney 27576fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2758ca45077fSPaul Mullowney { 2759b175d8bbSPaul Mullowney PetscErrorCode ierr; 2760ca45077fSPaul Mullowney 2761ca45077fSPaul Mullowney PetscFunctionBegin; 2762e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2763ca45077fSPaul Mullowney PetscFunctionReturn(0); 2764ca45077fSPaul Mullowney } 2765ca45077fSPaul Mullowney 2766a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2767a0e72f99SJunchao Zhang { 2768a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2769a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2770a0e72f99SJunchao Zhang } 2771a0e72f99SJunchao Zhang 2772afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2773e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 27749ae82921SPaul Mullowney { 27759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2776aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 27779ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2778e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2779b175d8bbSPaul Mullowney PetscErrorCode ierr; 278057d48284SJunchao Zhang cudaError_t cerr; 2781aa372e3fSPaul Mullowney cusparseStatus_t stat; 2782e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2783e6e9a74fSStefano Zampini PetscBool compressed; 2784afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2785afb2bd1cSJunchao Zhang PetscInt nx,ny; 2786afb2bd1cSJunchao Zhang #endif 27876e111a19SKarl Rupp 27889ae82921SPaul Mullowney PetscFunctionBegin; 2789e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2790e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2791afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2792d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2793e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2794e6e9a74fSStefano Zampini } 279534d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 279634d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2797e6e9a74fSStefano Zampini if (!trans) { 27989ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2799c9567895SMark if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2800e6e9a74fSStefano Zampini } else { 28011a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 2802e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2803e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2804e6e9a74fSStefano Zampini } else { 28051a2c6b5cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);} 2806e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2807e6e9a74fSStefano Zampini } 2808e6e9a74fSStefano Zampini } 2809e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2810e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2811213423ffSJunchao Zhang 2812e6e9a74fSStefano Zampini try { 2813e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2814213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2815213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2816afb2bd1cSJunchao Zhang 281785ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2818e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2819afb2bd1cSJunchao Zhang /* z = A x + beta y. 2820afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
2821afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2822afb2bd1cSJunchao Zhang */ 2823e6e9a74fSStefano Zampini xptr = xarray; 2824afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2825213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2826afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2827afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2828afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2829afb2bd1cSJunchao Zhang */ 2830afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2831afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2832afb2bd1cSJunchao Zhang nx = mat->num_cols; 2833afb2bd1cSJunchao Zhang ny = mat->num_rows; 2834afb2bd1cSJunchao Zhang } 2835afb2bd1cSJunchao Zhang #endif 2836e6e9a74fSStefano Zampini } else { 2837afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2838afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2839afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2840afb2bd1cSJunchao Zhang */ 2841afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2842e6e9a74fSStefano Zampini dptr = zarray; 2843e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2844afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2845e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2846a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2847e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2848e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2849e6e9a74fSStefano Zampini } 2850afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2851afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2852afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2853afb2bd1cSJunchao Zhang nx = mat->num_rows; 2854afb2bd1cSJunchao Zhang ny = mat->num_cols; 2855afb2bd1cSJunchao Zhang } 2856afb2bd1cSJunchao Zhang #endif 2857e6e9a74fSStefano Zampini } 28589ae82921SPaul Mullowney 2859afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2860aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2861afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2862afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2863afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2864afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2865afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 
2866afb2bd1cSJunchao Zhang stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2867afb2bd1cSJunchao Zhang matstruct->matDescr, 2868afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2869afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2870afb2bd1cSJunchao Zhang cusparse_scalartype, 2871afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2872afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2873afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2874afb2bd1cSJunchao Zhang 2875afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2876afb2bd1cSJunchao Zhang } else { 2877afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2878afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2879afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2880afb2bd1cSJunchao Zhang } 2881afb2bd1cSJunchao Zhang 2882afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2883afb2bd1cSJunchao Zhang matstruct->alpha_one, 28841a2c6b5cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */ 2885afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2886afb2bd1cSJunchao Zhang beta, 2887afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2888afb2bd1cSJunchao Zhang cusparse_scalartype, 2889afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2890afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2891afb2bd1cSJunchao Zhang #else 28927656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2893e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2894a65300a6SPaul 
Mullowney mat->num_rows, mat->num_cols, 2895afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2896aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2897e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 289857d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2899afb2bd1cSJunchao Zhang #endif 2900aa372e3fSPaul Mullowney } else { 2901213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2902afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2903afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2904afb2bd1cSJunchao Zhang #else 2905301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2906e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2907afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2908e6e9a74fSStefano Zampini xptr, beta, 290957d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2910afb2bd1cSJunchao Zhang #endif 2911a65300a6SPaul Mullowney } 2912aa372e3fSPaul Mullowney } 291305035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2914958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2915aa372e3fSPaul Mullowney 2916e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2917213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2918213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2919213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2920e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2921213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 29227656d835SStefano Zampini } 2923213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2924c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 29257656d835SStefano Zampini } 29267656d835SStefano Zampini 2927213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2928213423ffSJunchao Zhang if (compressed) { 2929e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2930a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 2931a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 2932a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 2933a0e72f99SJunchao Zhang */ 2934a0e72f99SJunchao Zhang #if 0 2935a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2936a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 2937a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2938e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2939c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 2940a0e72f99SJunchao Zhang #else 2941a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 2942a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 2943a0e72f99SJunchao Zhang #endif 294405035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2945958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 
2946e6e9a74fSStefano Zampini } 2947e6e9a74fSStefano Zampini } else { 2948e6e9a74fSStefano Zampini if (yy && yy != zz) { 2949e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2950e6e9a74fSStefano Zampini } 2951e6e9a74fSStefano Zampini } 2952e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2953213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2954213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 29559ae82921SPaul Mullowney } catch(char *ex) { 29569ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 29579ae82921SPaul Mullowney } 2958e6e9a74fSStefano Zampini if (yy) { 2959958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2960e6e9a74fSStefano Zampini } else { 2961e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2962e6e9a74fSStefano Zampini } 29639ae82921SPaul Mullowney PetscFunctionReturn(0); 29649ae82921SPaul Mullowney } 29659ae82921SPaul Mullowney 29666fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2967ca45077fSPaul Mullowney { 2968b175d8bbSPaul Mullowney PetscErrorCode ierr; 29696e111a19SKarl Rupp 2970ca45077fSPaul Mullowney PetscFunctionBegin; 2971e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2972ca45077fSPaul Mullowney PetscFunctionReturn(0); 2973ca45077fSPaul Mullowney } 2974ca45077fSPaul Mullowney 29756fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 29769ae82921SPaul Mullowney { 29779ae82921SPaul Mullowney PetscErrorCode ierr; 2978a587d139SMark PetscSplitCSRDataStructure *d_mat = NULL; 29799ae82921SPaul Mullowney PetscFunctionBegin; 2980bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 29813fa6b06aSMark 
Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2982bc3f50f2SPaul Mullowney } 29833fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 29843fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2985a587d139SMark if (d_mat) { 29863fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 29873fa6b06aSMark Adams } 29883fa6b06aSMark Adams 29899ae82921SPaul Mullowney PetscFunctionReturn(0); 29909ae82921SPaul Mullowney } 29919ae82921SPaul Mullowney 29929ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2993e057df02SPaul Mullowney /*@ 29949ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2995e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2996e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2997e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2998e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2999e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 30009ae82921SPaul Mullowney 3001d083f849SBarry Smith Collective 30029ae82921SPaul Mullowney 30039ae82921SPaul Mullowney Input Parameters: 30049ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 30059ae82921SPaul Mullowney . m - number of rows 30069ae82921SPaul Mullowney . n - number of columns 30079ae82921SPaul Mullowney . 
nz - number of nonzeros per row (same for all rows) 30089ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 30090298fd71SBarry Smith (possibly different for each row) or NULL 30109ae82921SPaul Mullowney 30119ae82921SPaul Mullowney Output Parameter: 30129ae82921SPaul Mullowney . A - the matrix 30139ae82921SPaul Mullowney 30149ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 30159ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 30169ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 30179ae82921SPaul Mullowney 30189ae82921SPaul Mullowney Notes: 30199ae82921SPaul Mullowney If nnz is given then nz is ignored 30209ae82921SPaul Mullowney 30219ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 30229ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 30239ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 30249ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 30259ae82921SPaul Mullowney 30269ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 30270298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 30289ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 30299ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 30309ae82921SPaul Mullowney 30319ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 30329ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. 
We 30339ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 30349ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 30359ae82921SPaul Mullowney 30369ae82921SPaul Mullowney Level: intermediate 30379ae82921SPaul Mullowney 3038e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 30399ae82921SPaul Mullowney @*/ 30409ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 30419ae82921SPaul Mullowney { 30429ae82921SPaul Mullowney PetscErrorCode ierr; 30439ae82921SPaul Mullowney 30449ae82921SPaul Mullowney PetscFunctionBegin; 30459ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 30469ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 30479ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 30489ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 30499ae82921SPaul Mullowney PetscFunctionReturn(0); 30509ae82921SPaul Mullowney } 30519ae82921SPaul Mullowney 30526fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 30539ae82921SPaul Mullowney { 30549ae82921SPaul Mullowney PetscErrorCode ierr; 30553fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 3056ab25e6cbSDominic Meiser 30579ae82921SPaul Mullowney PetscFunctionBegin; 30589ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 30593fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 30603fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3061470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 30629ae82921SPaul Mullowney } else { 3063470880abSPatrick Sanan ierr = 
MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3064aa372e3fSPaul Mullowney } 30653fa6b06aSMark Adams if (d_mat) { 30663fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 30673fa6b06aSMark Adams cudaError_t err; 30683fa6b06aSMark Adams PetscSplitCSRDataStructure h_mat; 30693fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 30703fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 30713fa6b06aSMark Adams if (a->compressedrow.use) { 30723fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 30733fa6b06aSMark Adams } 30743fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 30753fa6b06aSMark Adams } 3076c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3077ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3078ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3079ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3080fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3081ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 30827e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 30837e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 30849ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 30859ae82921SPaul Mullowney PetscFunctionReturn(0); 
30869ae82921SPaul Mullowney } 30879ae82921SPaul Mullowney 3088ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 308995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 30909ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 30919ff858a8SKarl Rupp { 30929ff858a8SKarl Rupp PetscErrorCode ierr; 30939ff858a8SKarl Rupp 30949ff858a8SKarl Rupp PetscFunctionBegin; 30959ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3096ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 30979ff858a8SKarl Rupp PetscFunctionReturn(0); 30989ff858a8SKarl Rupp } 30999ff858a8SKarl Rupp 3100039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 310195639643SRichard Tran Mills { 3102e6e9a74fSStefano Zampini PetscErrorCode ierr; 3103a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3104039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3105039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3106039c6fbaSStefano Zampini PetscScalar *ay; 3107039c6fbaSStefano Zampini const PetscScalar *ax; 3108039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3109039c6fbaSStefano Zampini cudaError_t cerr; 3110e6e9a74fSStefano Zampini 311195639643SRichard Tran Mills PetscFunctionBegin; 3112a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3113a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3114039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 3115a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3116a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3117a587d139SMark PetscFunctionReturn(0); 311895639643SRichard Tran Mills } 3119039c6fbaSStefano Zampini /* if we are here, it means both 
matrices are bound to GPU */ 3120a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3121a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3122039c6fbaSStefano Zampini if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3123039c6fbaSStefano Zampini if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3124039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3125039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3126039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3127039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3128039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3129039c6fbaSStefano Zampini if (eq) { 3130039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3131039c6fbaSStefano Zampini } 3132039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3133039c6fbaSStefano Zampini } 3134d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3135d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3136039c6fbaSStefano Zampini 3137039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3138039c6fbaSStefano Zampini cusparseStatus_t stat; 3139039c6fbaSStefano Zampini PetscScalar b = 1.0; 3140039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3141039c6fbaSStefano Zampini size_t bufferSize; 3142039c6fbaSStefano Zampini void *buffer; 3143039c6fbaSStefano Zampini #endif 3144039c6fbaSStefano Zampini 3145039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3146039c6fbaSStefano 
Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3147039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3148039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3149039c6fbaSStefano Zampini stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3150039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3151039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3152039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3153039c6fbaSStefano Zampini cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3154039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3155039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3156039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3157039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3158039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3159039c6fbaSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3160039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3161039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3162039c6fbaSStefano Zampini cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3163039c6fbaSStefano Zampini #else 3164039c6fbaSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3165039c6fbaSStefano Zampini stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3166039c6fbaSStefano Zampini 
&a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3167039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3168039c6fbaSStefano Zampini cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3169039c6fbaSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3170039c6fbaSStefano Zampini ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3171039c6fbaSStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3172039c6fbaSStefano Zampini #endif 3173039c6fbaSStefano Zampini stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3174039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3175039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3176039c6fbaSStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3177039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3178a587d139SMark cublasHandle_t cublasv2handle; 3179039c6fbaSStefano Zampini cublasStatus_t berr; 3180a587d139SMark PetscBLASInt one = 1, bnz = 1; 3181039c6fbaSStefano Zampini 3182039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3183039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3184a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3185a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3186a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3187039c6fbaSStefano Zampini berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3188039c6fbaSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3189a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3190a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3191039c6fbaSStefano Zampini ierr = 
MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3192039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3193a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3194039c6fbaSStefano Zampini } else { 3195a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3196d2be01edSStefano Zampini ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3197a587d139SMark } 319895639643SRichard Tran Mills PetscFunctionReturn(0); 319995639643SRichard Tran Mills } 320095639643SRichard Tran Mills 320133c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 320233c9ba73SStefano Zampini { 320333c9ba73SStefano Zampini PetscErrorCode ierr; 320433c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 320533c9ba73SStefano Zampini PetscScalar *ay; 320633c9ba73SStefano Zampini cudaError_t cerr; 320733c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 320833c9ba73SStefano Zampini cublasStatus_t berr; 320933c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 321033c9ba73SStefano Zampini 321133c9ba73SStefano Zampini PetscFunctionBegin; 321233c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 321333c9ba73SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 321433c9ba73SStefano Zampini ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 321533c9ba73SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 321633c9ba73SStefano Zampini berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 321733c9ba73SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 321833c9ba73SStefano Zampini ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 321933c9ba73SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 322033c9ba73SStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 322133c9ba73SStefano Zampini ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 322233c9ba73SStefano Zampini 
PetscFunctionReturn(0); 322333c9ba73SStefano Zampini } 322433c9ba73SStefano Zampini 32253fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 32263fa6b06aSMark Adams { 32273fa6b06aSMark Adams PetscErrorCode ierr; 32287e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3229a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 32307e8381f9SStefano Zampini 32313fa6b06aSMark Adams PetscFunctionBegin; 32323fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 32333fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 32347e8381f9SStefano Zampini if (spptr->mat) { 32357e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 32367e8381f9SStefano Zampini if (matrix->values) { 32377e8381f9SStefano Zampini both = PETSC_TRUE; 32387e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 32397e8381f9SStefano Zampini } 32407e8381f9SStefano Zampini } 32417e8381f9SStefano Zampini if (spptr->matTranspose) { 32427e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 32437e8381f9SStefano Zampini if (matrix->values) { 32447e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 32457e8381f9SStefano Zampini } 32467e8381f9SStefano Zampini } 32473fa6b06aSMark Adams } 3248a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3249a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3250a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 32517e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3252a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 32533fa6b06aSMark Adams 32543fa6b06aSMark Adams PetscFunctionReturn(0); 32553fa6b06aSMark Adams } 32563fa6b06aSMark Adams 3257a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3258a587d139SMark { 3259a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 
3260a587d139SMark PetscErrorCode ierr; 3261a587d139SMark 3262a587d139SMark PetscFunctionBegin; 3263a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3264a587d139SMark if (flg) { 3265a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3266a587d139SMark 326733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3268a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3269a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3270a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3271a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3272a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3273a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3274a587d139SMark A->ops->multhermitiantranspose = NULL; 3275a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3276fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3277c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3278a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3279a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3280a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3281a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3282a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3283fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3284a587d139SMark } else { 328533c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3286a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 
3287a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3288a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3289a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3290a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3291a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3292a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3293a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3294fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3295c215019aSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3296a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3297a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3298a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3299a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3300a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3301fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3302a587d139SMark } 3303a587d139SMark A->boundtocpu = flg; 3304a587d139SMark a->inode.use = flg; 3305a587d139SMark PetscFunctionReturn(0); 3306a587d139SMark } 3307a587d139SMark 330849735bf3SStefano Zampini PETSC_INTERN 
PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 33099ae82921SPaul Mullowney { 33109ae82921SPaul Mullowney PetscErrorCode ierr; 3311aa372e3fSPaul Mullowney cusparseStatus_t stat; 331249735bf3SStefano Zampini Mat B; 33139ae82921SPaul Mullowney 33149ae82921SPaul Mullowney PetscFunctionBegin; 3315832b2c02SStefano Zampini ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 331649735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 331749735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 331849735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 331949735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 332049735bf3SStefano Zampini } 332149735bf3SStefano Zampini B = *newmat; 332249735bf3SStefano Zampini 332334136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 332434136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 332534136279SStefano Zampini 332649735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 33279ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3328e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3329e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3330e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3331a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 33321a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3333d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3334d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3335d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3336d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 
3337d8132acaSStefano Zampini #endif 33381a2c6b5cSJunchao Zhang B->spptr = spptr; 33399ae82921SPaul Mullowney } else { 3340e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3341e6e9a74fSStefano Zampini 3342e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3343e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3344a0e72f99SJunchao Zhang stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3345e6e9a74fSStefano Zampini B->spptr = spptr; 33469ae82921SPaul Mullowney } 3347e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 334849735bf3SStefano Zampini } 3349693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 33509ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 33511a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 33529ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 335395639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3354693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 33552205254eSKarl Rupp 3356e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 33579ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3358bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 33599ae82921SPaul Mullowney PetscFunctionReturn(0); 33609ae82921SPaul Mullowney } 33619ae82921SPaul Mullowney 336202fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 336302fe1965SBarry Smith { 336402fe1965SBarry Smith PetscErrorCode ierr; 336502fe1965SBarry Smith 336602fe1965SBarry Smith PetscFunctionBegin; 336702fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 33680ce8acdeSStefano Zampini ierr = 
MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 336902fe1965SBarry Smith PetscFunctionReturn(0); 337002fe1965SBarry Smith } 337102fe1965SBarry Smith 33723ca39a21SBarry Smith /*MC 3373e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3374e057df02SPaul Mullowney 3375e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 33762692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 33772692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3378e057df02SPaul Mullowney 3379e057df02SPaul Mullowney Options Database Keys: 3380e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3381aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3382a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3383e057df02SPaul Mullowney 3384e057df02SPaul Mullowney Level: beginner 3385e057df02SPaul Mullowney 33868468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3387e057df02SPaul Mullowney M*/ 33887f756511SDominic Meiser 3389*bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 33900f39cd5aSBarry Smith 33913ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 339242c9c57cSBarry Smith { 339342c9c57cSBarry Smith PetscErrorCode ierr; 339442c9c57cSBarry Smith 339542c9c57cSBarry Smith PetscFunctionBegin; 3396*bddcd29dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 33973ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 33983ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 33993ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 34003ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3401*bddcd29dSMark Adams 340242c9c57cSBarry Smith PetscFunctionReturn(0); 340342c9c57cSBarry Smith } 340429b38603SBarry Smith 3405470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 34067f756511SDominic Meiser { 3407e6e9a74fSStefano Zampini PetscErrorCode ierr; 34087f756511SDominic Meiser cusparseStatus_t stat; 34097f756511SDominic Meiser 34107f756511SDominic Meiser PetscFunctionBegin; 34117f756511SDominic Meiser if 
(*cusparsestruct) { 3412e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3413e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 34147f756511SDominic Meiser delete (*cusparsestruct)->workVector; 341581902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 34167e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 34177e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3418a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 34197e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3420e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 34217f756511SDominic Meiser } 34227f756511SDominic Meiser PetscFunctionReturn(0); 34237f756511SDominic Meiser } 34247f756511SDominic Meiser 34257f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 34267f756511SDominic Meiser { 34277f756511SDominic Meiser PetscFunctionBegin; 34287f756511SDominic Meiser if (*mat) { 34297f756511SDominic Meiser delete (*mat)->values; 34307f756511SDominic Meiser delete (*mat)->column_indices; 34317f756511SDominic Meiser delete (*mat)->row_offsets; 34327f756511SDominic Meiser delete *mat; 34337f756511SDominic Meiser *mat = 0; 34347f756511SDominic Meiser } 34357f756511SDominic Meiser PetscFunctionReturn(0); 34367f756511SDominic Meiser } 34377f756511SDominic Meiser 3438470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 34397f756511SDominic Meiser { 34407f756511SDominic Meiser cusparseStatus_t stat; 34417f756511SDominic Meiser PetscErrorCode ierr; 34427f756511SDominic Meiser 34437f756511SDominic Meiser PetscFunctionBegin; 34447f756511SDominic Meiser if (*trifactor) { 344557d48284SJunchao Zhang if 
((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3446afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 34477f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 34481b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 34492cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3450afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 34511b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3452afb2bd1cSJunchao Zhang #endif 3453da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 34547f756511SDominic Meiser } 34557f756511SDominic Meiser PetscFunctionReturn(0); 34567f756511SDominic Meiser } 34577f756511SDominic Meiser 3458470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 34597f756511SDominic Meiser { 34607f756511SDominic Meiser CsrMatrix *mat; 34617f756511SDominic Meiser cusparseStatus_t stat; 34627f756511SDominic Meiser cudaError_t err; 34637f756511SDominic Meiser 34647f756511SDominic Meiser PetscFunctionBegin; 34657f756511SDominic Meiser if (*matstruct) { 34667f756511SDominic Meiser if ((*matstruct)->mat) { 34677f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3468afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3469afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3470afb2bd1cSJunchao Zhang #else 34717f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 347257d48284SJunchao Zhang stat = 
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3473afb2bd1cSJunchao Zhang #endif 34747f756511SDominic Meiser } else { 34757f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 34767f756511SDominic Meiser CsrMatrix_Destroy(&mat); 34777f756511SDominic Meiser } 34787f756511SDominic Meiser } 347957d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 34807f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3481afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 34827656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 34837656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3484afb2bd1cSJunchao Zhang 3485afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3486afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3487afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3488afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3489afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 3490afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3491afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3492afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3493afb2bd1cSJunchao Zhang } 3494afb2bd1cSJunchao Zhang } 3495afb2bd1cSJunchao Zhang #endif 34967f756511SDominic Meiser delete *matstruct; 34977e8381f9SStefano Zampini *matstruct = NULL; 34987f756511SDominic Meiser } 34997f756511SDominic Meiser PetscFunctionReturn(0); 35007f756511SDominic Meiser } 35017f756511SDominic Meiser 3502ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 
35037f756511SDominic Meiser { 3504e6e9a74fSStefano Zampini PetscErrorCode ierr; 3505e6e9a74fSStefano Zampini 35067f756511SDominic Meiser PetscFunctionBegin; 35077f756511SDominic Meiser if (*trifactors) { 3508e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3509e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3510e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3511e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 35127f756511SDominic Meiser delete (*trifactors)->rpermIndices; 35137f756511SDominic Meiser delete (*trifactors)->cpermIndices; 35147f756511SDominic Meiser delete (*trifactors)->workVector; 35157e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 35167e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 35177e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 3518*bddcd29dSMark Adams if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3519*bddcd29dSMark Adams if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3520ccdfe979SStefano Zampini } 3521ccdfe979SStefano Zampini PetscFunctionReturn(0); 3522ccdfe979SStefano Zampini } 3523ccdfe979SStefano Zampini 3524ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3525ccdfe979SStefano Zampini { 3526e6e9a74fSStefano Zampini PetscErrorCode ierr; 3527ccdfe979SStefano Zampini cusparseHandle_t handle; 3528ccdfe979SStefano Zampini cusparseStatus_t stat; 3529ccdfe979SStefano Zampini 3530ccdfe979SStefano Zampini PetscFunctionBegin; 3531ccdfe979SStefano Zampini if (*trifactors) { 3532e6e9a74fSStefano Zampini ierr = 
MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 35337f756511SDominic Meiser if (handle = (*trifactors)->handle) { 353457d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 35357f756511SDominic Meiser } 3536e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 35377f756511SDominic Meiser } 35387f756511SDominic Meiser PetscFunctionReturn(0); 35397f756511SDominic Meiser } 35407e8381f9SStefano Zampini 35417e8381f9SStefano Zampini struct IJCompare 35427e8381f9SStefano Zampini { 35437e8381f9SStefano Zampini __host__ __device__ 35447e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 35457e8381f9SStefano Zampini { 35467e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 35477e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 35487e8381f9SStefano Zampini return false; 35497e8381f9SStefano Zampini } 35507e8381f9SStefano Zampini }; 35517e8381f9SStefano Zampini 35527e8381f9SStefano Zampini struct IJEqual 35537e8381f9SStefano Zampini { 35547e8381f9SStefano Zampini __host__ __device__ 35557e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 35567e8381f9SStefano Zampini { 35577e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 35587e8381f9SStefano Zampini return true; 35597e8381f9SStefano Zampini } 35607e8381f9SStefano Zampini }; 35617e8381f9SStefano Zampini 35627e8381f9SStefano Zampini struct IJDiff 35637e8381f9SStefano Zampini { 35647e8381f9SStefano Zampini __host__ __device__ 35657e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 35667e8381f9SStefano Zampini { 35677e8381f9SStefano Zampini return t1 == t2 ? 
0 : 1; 35687e8381f9SStefano Zampini } 35697e8381f9SStefano Zampini }; 35707e8381f9SStefano Zampini 35717e8381f9SStefano Zampini struct IJSum 35727e8381f9SStefano Zampini { 35737e8381f9SStefano Zampini __host__ __device__ 35747e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 35757e8381f9SStefano Zampini { 35767e8381f9SStefano Zampini return t1||t2; 35777e8381f9SStefano Zampini } 35787e8381f9SStefano Zampini }; 35797e8381f9SStefano Zampini 35807e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3581e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 35827e8381f9SStefano Zampini { 35837e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3584fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3585bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 358608391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 35877e8381f9SStefano Zampini CsrMatrix *matrix; 35887e8381f9SStefano Zampini PetscErrorCode ierr; 35897e8381f9SStefano Zampini cudaError_t cerr; 35907e8381f9SStefano Zampini PetscInt n; 35917e8381f9SStefano Zampini 35927e8381f9SStefano Zampini PetscFunctionBegin; 35937e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 35947e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 35957e8381f9SStefano Zampini if (!cusp->cooPerm) { 35967e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 35977e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 35987e8381f9SStefano Zampini PetscFunctionReturn(0); 35997e8381f9SStefano Zampini } 36007e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 36017e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3602e61fc153SStefano 
Zampini if (!v) { 3603e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3604e61fc153SStefano Zampini goto finalize; 36057e8381f9SStefano Zampini } 3606e61fc153SStefano Zampini n = cusp->cooPerm->size(); 360708391a17SStefano Zampini if (isCudaMem(v)) { 360808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 360908391a17SStefano Zampini } else { 3610e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3611e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 361208391a17SStefano Zampini d_v = cooPerm_v->data(); 3613e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 361408391a17SStefano Zampini } 3615bfcc3627SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3616e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 36177e8381f9SStefano Zampini if (cusp->cooPerm_a) { 3618bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 361908391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3620e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3621e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3622e61fc153SStefano Zampini delete cooPerm_w; 36237e8381f9SStefano Zampini } else { 362408391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 36257e8381f9SStefano Zampini matrix->values->begin())); 362608391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 36277e8381f9SStefano Zampini 
matrix->values->end())); 36287e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 36297e8381f9SStefano Zampini } 36307e8381f9SStefano Zampini } else { 3631e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 363208391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3633e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 36347e8381f9SStefano Zampini } else { 363508391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 36367e8381f9SStefano Zampini matrix->values->begin())); 363708391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 36387e8381f9SStefano Zampini matrix->values->end())); 36397e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 36407e8381f9SStefano Zampini } 36417e8381f9SStefano Zampini } 36427e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3643bfcc3627SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3644e61fc153SStefano Zampini finalize: 3645e61fc153SStefano Zampini delete cooPerm_v; 36467e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3647e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3648fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3649fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3650fcdce8c4SStefano Zampini ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3651fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is 
%D\n",a->rmax);CHKERRQ(ierr); 3652fcdce8c4SStefano Zampini a->reallocs = 0; 3653fcdce8c4SStefano Zampini A->info.mallocs += 0; 3654fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3655fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3656fcdce8c4SStefano Zampini A->num_ass++; 36577e8381f9SStefano Zampini PetscFunctionReturn(0); 36587e8381f9SStefano Zampini } 36597e8381f9SStefano Zampini 3660a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3661a49f1ed0SStefano Zampini { 3662a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3663a49f1ed0SStefano Zampini PetscErrorCode ierr; 3664a49f1ed0SStefano Zampini 3665a49f1ed0SStefano Zampini PetscFunctionBegin; 3666a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3667a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3668a49f1ed0SStefano Zampini if (destroy) { 3669a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3670a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3671a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3672a49f1ed0SStefano Zampini } 36731a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3674a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3675a49f1ed0SStefano Zampini } 3676a49f1ed0SStefano Zampini 36777e8381f9SStefano Zampini #include <thrust/binary_search.h> 3678e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 36797e8381f9SStefano Zampini { 36807e8381f9SStefano Zampini PetscErrorCode ierr; 36817e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 36827e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 36837e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 36847e8381f9SStefano Zampini cudaError_t cerr; 36857e8381f9SStefano Zampini 36867e8381f9SStefano Zampini 
PetscFunctionBegin; 36877e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 36887e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 36897e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 36907e8381f9SStefano Zampini if (n != cooPerm_n) { 36917e8381f9SStefano Zampini delete cusp->cooPerm; 36927e8381f9SStefano Zampini delete cusp->cooPerm_a; 36937e8381f9SStefano Zampini cusp->cooPerm = NULL; 36947e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 36957e8381f9SStefano Zampini } 36967e8381f9SStefano Zampini if (n) { 36977e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 36987e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 36997e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 37007e8381f9SStefano Zampini 37017e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 37027e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 37037e8381f9SStefano Zampini 37047e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 37057e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 37067e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 37077e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 37087e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 37097e8381f9SStefano Zampini 371008391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 37117e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 37127e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 37137e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 37147e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 37157e8381f9SStefano Zampini 37167e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 37177e8381f9SStefano Zampini if (nekey == ekey) 
{ /* all entries are unique */ 37187e8381f9SStefano Zampini delete cusp->cooPerm_a; 37197e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 37207e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 37217e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 37227e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 37237e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 37247e8381f9SStefano Zampini w[0] = 0; 37257e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 37267e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 37277e8381f9SStefano Zampini } 37287e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 37297e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 37307e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 37317e8381f9SStefano Zampini ii.begin()); 373208391a17SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 373308391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 37347e8381f9SStefano Zampini 37357e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 37367e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 37377e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 37387e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 37397e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 37407e8381f9SStefano Zampini a->i[0] = 0; 37417e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 37427e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3743fcdce8c4SStefano Zampini a->rmax = 0; 37447e8381f9SStefano Zampini ierr = 
PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 37457e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 37467e8381f9SStefano Zampini cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 37477e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 37487e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 37497e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 37507e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 37517e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 37527e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3753fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 37547e8381f9SStefano Zampini } 3755fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 37567e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 37577e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3758fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 37597e8381f9SStefano Zampini } else { 37607e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 37617e8381f9SStefano Zampini } 3762e61fc153SStefano Zampini ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 37637e8381f9SStefano Zampini 37647e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 
3765e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 3766e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 37677e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 37687e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 37697e8381f9SStefano Zampini A->nonzerostate++; 37707e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3771a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 37727e8381f9SStefano Zampini 37737e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 37747e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 37757e8381f9SStefano Zampini PetscFunctionReturn(0); 37767e8381f9SStefano Zampini } 3777ed502f03SStefano Zampini 3778ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3779ed502f03SStefano Zampini { 3780ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3781ed502f03SStefano Zampini CsrMatrix *csr; 3782ed502f03SStefano Zampini PetscErrorCode ierr; 3783ed502f03SStefano Zampini 3784ed502f03SStefano Zampini PetscFunctionBegin; 3785ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3786ed502f03SStefano Zampini PetscValidPointer(a,2); 3787ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3788ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3789ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 379033c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3791ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3792ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3793ed502f03SStefano Zampini 
*a = csr->values->data().get(); 3794ed502f03SStefano Zampini PetscFunctionReturn(0); 3795ed502f03SStefano Zampini } 3796ed502f03SStefano Zampini 3797ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3798ed502f03SStefano Zampini { 3799ed502f03SStefano Zampini PetscFunctionBegin; 3800ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3801ed502f03SStefano Zampini PetscValidPointer(a,2); 3802ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3803ed502f03SStefano Zampini *a = NULL; 3804ed502f03SStefano Zampini PetscFunctionReturn(0); 3805ed502f03SStefano Zampini } 3806ed502f03SStefano Zampini 3807039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3808039c6fbaSStefano Zampini { 3809039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3810039c6fbaSStefano Zampini CsrMatrix *csr; 3811039c6fbaSStefano Zampini PetscErrorCode ierr; 3812039c6fbaSStefano Zampini 3813039c6fbaSStefano Zampini PetscFunctionBegin; 3814039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3815039c6fbaSStefano Zampini PetscValidPointer(a,2); 3816039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3817039c6fbaSStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3818039c6fbaSStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 381933c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3820039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3821039c6fbaSStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3822039c6fbaSStefano Zampini *a = csr->values->data().get(); 3823039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3824a49f1ed0SStefano Zampini ierr = 
MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3825039c6fbaSStefano Zampini PetscFunctionReturn(0); 3826039c6fbaSStefano Zampini } 3827039c6fbaSStefano Zampini 3828039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3829039c6fbaSStefano Zampini { 3830039c6fbaSStefano Zampini PetscErrorCode ierr; 3831039c6fbaSStefano Zampini 3832039c6fbaSStefano Zampini PetscFunctionBegin; 3833039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3834039c6fbaSStefano Zampini PetscValidPointer(a,2); 3835039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3836039c6fbaSStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3837039c6fbaSStefano Zampini *a = NULL; 3838039c6fbaSStefano Zampini PetscFunctionReturn(0); 3839039c6fbaSStefano Zampini } 3840039c6fbaSStefano Zampini 3841ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3842ed502f03SStefano Zampini { 3843ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3844ed502f03SStefano Zampini CsrMatrix *csr; 3845a49f1ed0SStefano Zampini PetscErrorCode ierr; 3846ed502f03SStefano Zampini 3847ed502f03SStefano Zampini PetscFunctionBegin; 3848ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3849ed502f03SStefano Zampini PetscValidPointer(a,2); 3850ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3851ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 385233c9ba73SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3853ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3854ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3855ed502f03SStefano Zampini *a = csr->values->data().get(); 
3856039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3857a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3858ed502f03SStefano Zampini PetscFunctionReturn(0); 3859ed502f03SStefano Zampini } 3860ed502f03SStefano Zampini 3861ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3862ed502f03SStefano Zampini { 3863ed502f03SStefano Zampini PetscErrorCode ierr; 3864ed502f03SStefano Zampini 3865ed502f03SStefano Zampini PetscFunctionBegin; 3866ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3867ed502f03SStefano Zampini PetscValidPointer(a,2); 3868ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3869ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3870ed502f03SStefano Zampini *a = NULL; 3871ed502f03SStefano Zampini PetscFunctionReturn(0); 3872ed502f03SStefano Zampini } 3873ed502f03SStefano Zampini 3874ed502f03SStefano Zampini struct IJCompare4 3875ed502f03SStefano Zampini { 3876ed502f03SStefano Zampini __host__ __device__ 38772ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3878ed502f03SStefano Zampini { 3879ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 3880ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3881ed502f03SStefano Zampini return false; 3882ed502f03SStefano Zampini } 3883ed502f03SStefano Zampini }; 3884ed502f03SStefano Zampini 38858909a122SStefano Zampini struct Shift 38868909a122SStefano Zampini { 3887ed502f03SStefano Zampini int _shift; 3888ed502f03SStefano Zampini 3889ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 3890ed502f03SStefano Zampini __host__ __device__ 3891ed502f03SStefano Zampini inline int operator() (const int &c) 3892ed502f03SStefano Zampini { 3893ed502f03SStefano Zampini return c + 
_shift; 3894ed502f03SStefano Zampini } 3895ed502f03SStefano Zampini }; 3896ed502f03SStefano Zampini 3897ed502f03SStefano Zampini /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3898ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3899ed502f03SStefano Zampini { 3900ed502f03SStefano Zampini PetscErrorCode ierr; 3901ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3902ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3903ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3904ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3905ed502f03SStefano Zampini PetscInt Annz,Bnnz; 3906ed502f03SStefano Zampini cusparseStatus_t stat; 3907ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 3908ed502f03SStefano Zampini cudaError_t cerr; 3909ed502f03SStefano Zampini 3910ed502f03SStefano Zampini PetscFunctionBegin; 3911ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3912ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3913ed502f03SStefano Zampini PetscValidPointer(C,4); 3914ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3915ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3916ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3917ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3918ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3919ed502f03SStefano Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 
3920ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 3921ed502f03SStefano Zampini m = A->rmap->n; 3922ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 3923ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3924ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3925ed502f03SStefano Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3926ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 3927ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3928ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3929ed502f03SStefano Zampini Ccsr = new CsrMatrix; 3930ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 3931ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 3932ed502f03SStefano Zampini c->compressedrow.nrows = 0; 3933ed502f03SStefano Zampini c->compressedrow.i = NULL; 3934ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 3935ed502f03SStefano Zampini Ccusp->workVector = NULL; 3936ed502f03SStefano Zampini Ccusp->nrows = m; 3937ed502f03SStefano Zampini Ccusp->mat = Cmat; 3938ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 3939ed502f03SStefano Zampini Ccsr->num_rows = m; 3940ed502f03SStefano Zampini Ccsr->num_cols = n; 3941ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3942ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3943ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3944ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3945ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3946ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3947ed502f03SStefano Zampini cerr = 
cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3948ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3949ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3950ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3951ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 39521a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 39531a2c6b5cSJunchao Zhang ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 3954ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3955ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3956ed502f03SStefano Zampini 3957ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 3958ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3959ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 3960ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 3961ed502f03SStefano Zampini c->nz = Annz + Bnnz; 3962ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3963ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3964ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3965ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 3966ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3967ed502f03SStefano Zampini if (c->nz) { 39682ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 39692ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 39702ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 39712ed87e7eSStefano 
Zampini THRUSTINTARRAY32 *Aroff,*Broff; 39722ed87e7eSStefano Zampini 3973ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 3974ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 3975ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3976ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3977ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3978ed502f03SStefano Zampini } 39792ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 39802ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 3981ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 3982ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 3983ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3984ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3985ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3986ed502f03SStefano Zampini } 39872ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 39882ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 3989ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 39902ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 39912ed87e7eSStefano Zampini Aroff->data().get(), 39922ed87e7eSStefano Zampini Annz, 39932ed87e7eSStefano Zampini m, 39942ed87e7eSStefano Zampini Acoo->data().get(), 39952ed87e7eSStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3996ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 39972ed87e7eSStefano Zampini Broff->data().get(), 3998ed502f03SStefano Zampini Bnnz, 3999ed502f03SStefano Zampini m, 40002ed87e7eSStefano Zampini Bcoo->data().get(), 4001ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 40022ed87e7eSStefano Zampini /* Issues when using bool with large matrices 
on SUMMIT 10.2.89 */ 40032ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 40042ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 40058909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4006ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4007ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 40088909a122SStefano Zampini #else 40098909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 40108909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 40118909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 40128909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 40138909a122SStefano Zampini #endif 40142ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 40152ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 40162ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 40172ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 40182ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 40192ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4020ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4021ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4022ed502f03SStefano Zampini thrust::advance(p2,Annz); 40232ed87e7eSStefano Zampini 
PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 40248909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 40258909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 40268909a122SStefano Zampini #endif 40272ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 40282ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 40292ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 40302ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 40312ed87e7eSStefano Zampini #else 40322ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 40332ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 40342ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 40352ed87e7eSStefano Zampini #endif 4036ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 40372ed87e7eSStefano Zampini Ccoo->data().get(), 4038ed502f03SStefano Zampini c->nz, 4039ed502f03SStefano Zampini m, 4040ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 4041ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4042ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 4043ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 40442ed87e7eSStefano Zampini delete wPerm; 40452ed87e7eSStefano Zampini delete Acoo; 40462ed87e7eSStefano Zampini delete Bcoo; 40472ed87e7eSStefano Zampini delete Ccoo; 4048ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4049ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4050ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4051ed502f03SStefano Zampini 
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4052ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4053ed502f03SStefano Zampini #endif 40541a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4055ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4056ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4057ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4058ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4059ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4060ed502f03SStefano Zampini 40611a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 40621a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4063a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4064ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4065ed502f03SStefano Zampini CmatT->mat = CcsrT; 4066ed502f03SStefano Zampini CcsrT->num_rows = n; 4067ed502f03SStefano Zampini CcsrT->num_cols = m; 4068ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4069ed502f03SStefano Zampini 4070ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4071ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4072ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4073ed502f03SStefano Zampini 4074ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4075ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4076ed502f03SStefano Zampini if (AT) { 4077ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4078ed502f03SStefano Zampini thrust::advance(rT,-1); 4079ed502f03SStefano Zampini } 4080ed502f03SStefano Zampini 
if (BT) { 4081ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4082ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4083ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4084ed502f03SStefano Zampini } 4085ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4086ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4087ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4088ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4089ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4090ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4091ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 4092ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4093ed502f03SStefano Zampini 4094ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4095ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4096ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4097ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4098ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4099ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4100ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4101ed502f03SStefano Zampini cerr = 
cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4102ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4103ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4104ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4105ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4106ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4107ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4108ed502f03SStefano Zampini #endif 4109ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4110ed502f03SStefano Zampini } 4111ed502f03SStefano Zampini } 4112ed502f03SStefano Zampini 4113ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4114ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4115ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 4116ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4117ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4118ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4119ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4120ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4121ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4122ed502f03SStefano Zampini jj = *Ccsr->column_indices; 4123ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4124ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4125ed502f03SStefano Zampini } else { 
4126ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4127ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4128ed502f03SStefano Zampini } 4129ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4130ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4131ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4132ed502f03SStefano Zampini c->maxnz = c->nz; 4133ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4134ed502f03SStefano Zampini c->rmax = 0; 4135ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4136ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4137ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4138ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4139ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4140ed502f03SStefano Zampini } 4141ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4142ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4143ed502f03SStefano Zampini (*C)->nonzerostate++; 4144ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4145ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4146ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4147ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4148ed502f03SStefano Zampini } else { 4149ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4150ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4151ed502f03SStefano Zampini if (c->nz) { 4152ed502f03SStefano Zampini Ccusp = 
(Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4153ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4154ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4155ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4156ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4157ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4158ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4159ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4160ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4161ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4162ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4163ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4164ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4165ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4166ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4167ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != 
%D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4168ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4169ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 4170ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4171ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4172ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4173ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4174ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4175ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4176ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4177ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4178ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4179ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4180ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4181a49f1ed0SStefano Zampini ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 41821a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4183ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4184ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4185ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4186ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4187ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4188ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4189ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4190ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 41911a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4192ed502f03SStefano Zampini } 4193ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 4194ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4195ed502f03SStefano Zampini } 4196ed502f03SStefano Zampini } 4197ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4198ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4199ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4200ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4201ed502f03SStefano Zampini PetscFunctionReturn(0); 4202ed502f03SStefano Zampini } 4203c215019aSStefano Zampini 4204c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4205c215019aSStefano Zampini { 4206c215019aSStefano Zampini PetscErrorCode ierr; 4207c215019aSStefano Zampini bool dmem; 4208c215019aSStefano Zampini const PetscScalar *av; 4209c215019aSStefano Zampini cudaError_t cerr; 4210c215019aSStefano Zampini 4211c215019aSStefano Zampini PetscFunctionBegin; 4212c215019aSStefano Zampini dmem = isCudaMem(v); 4213c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4214c215019aSStefano Zampini if (n && idx) { 4215c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4216c215019aSStefano Zampini widx.assign(idx,idx+n); 4217c215019aSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4218c215019aSStefano Zampini 4219c215019aSStefano Zampini THRUSTARRAY *w = NULL; 
4220c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4221c215019aSStefano Zampini if (dmem) { 4222c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4223c215019aSStefano Zampini } else { 4224c215019aSStefano Zampini w = new THRUSTARRAY(n); 4225c215019aSStefano Zampini dv = w->data(); 4226c215019aSStefano Zampini } 4227c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4228c215019aSStefano Zampini 4229c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4230c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4231c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4232c215019aSStefano Zampini if (w) { 4233c215019aSStefano Zampini cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4234c215019aSStefano Zampini } 4235c215019aSStefano Zampini delete w; 4236c215019aSStefano Zampini } else { 4237c215019aSStefano Zampini cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? 
cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4238c215019aSStefano Zampini } 4239c215019aSStefano Zampini if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4240c215019aSStefano Zampini ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4241c215019aSStefano Zampini PetscFunctionReturn(0); 4242c215019aSStefano Zampini } 4243*bddcd29dSMark Adams 4244*bddcd29dSMark Adams /* 4245*bddcd29dSMark Adams LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields) 4246*bddcd29dSMark Adams 4247*bddcd29dSMark Adams requires: 4248*bddcd29dSMark Adams structurally symmetric: fix with transpose/column meta data 4249*bddcd29dSMark Adams */ 4250*bddcd29dSMark Adams 4251*bddcd29dSMark Adams /* 4252*bddcd29dSMark Adams The GPU LU factor kernel 4253*bddcd29dSMark Adams */ 4254*bddcd29dSMark Adams __global__ 4255*bddcd29dSMark Adams void __launch_bounds__(1024,1) 4256*bddcd29dSMark Adams mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[]) 4257*bddcd29dSMark Adams { 4258*bddcd29dSMark Adams const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf; 4259*bddcd29dSMark Adams const PetscInt field = blockIdx.x, blkIdx = blockIdx.y; 4260*bddcd29dSMark Adams const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? 
(field+1)*nloc : (start_i + nloc_i); 4261*bddcd29dSMark Adams 4262*bddcd29dSMark Adams // set i (row+1) 4263*bddcd29dSMark Adams if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero 4264*bddcd29dSMark Adams // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block 4265*bddcd29dSMark Adams for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y 4266*bddcd29dSMark Adams if (rowb < end_i && threadIdx.x==0) { 4267*bddcd29dSMark Adams PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0; 4268*bddcd29dSMark Adams bi_csr[rowb+1] = n1L + nug - clip + n2L + i; 4269*bddcd29dSMark Adams } 4270*bddcd29dSMark Adams } 4271*bddcd29dSMark Adams } 4272*bddcd29dSMark Adams // copy AIJ to AIJ_BAND 4273*bddcd29dSMark Adams __global__ 4274*bddcd29dSMark Adams void __launch_bounds__(1024,1) 4275*bddcd29dSMark Adams mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[], 4276*bddcd29dSMark Adams const int ai_d[], const int aj_d[], const PetscScalar aa_d[], 4277*bddcd29dSMark Adams const int bi_csr[], PetscScalar ba_csr[]) 4278*bddcd29dSMark Adams { 4279*bddcd29dSMark Adams const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf; 4280*bddcd29dSMark Adams const PetscInt field = blockIdx.x, blkIdx = blockIdx.y; 4281*bddcd29dSMark Adams const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? 
(field+1)*nloc : (start_i + nloc_i); 4282*bddcd29dSMark Adams 4283*bddcd29dSMark Adams // zero B 4284*bddcd29dSMark Adams if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end 4285*bddcd29dSMark Adams for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y 4286*bddcd29dSMark Adams if (rowb < end_i) { 4287*bddcd29dSMark Adams PetscScalar *batmp = ba_csr + bi_csr[rowb]; 4288*bddcd29dSMark Adams const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb]; 4289*bddcd29dSMark Adams for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) { 4290*bddcd29dSMark Adams if (j<nzb) { 4291*bddcd29dSMark Adams batmp[j] = 0; 4292*bddcd29dSMark Adams } 4293*bddcd29dSMark Adams } 4294*bddcd29dSMark Adams } 4295*bddcd29dSMark Adams } 4296*bddcd29dSMark Adams 4297*bddcd29dSMark Adams // copy A into B with CSR format -- these two loops can be fused 4298*bddcd29dSMark Adams for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y 4299*bddcd29dSMark Adams if (rowb < end_i) { 4300*bddcd29dSMark Adams const PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 4301*bddcd29dSMark Adams const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? 
rowb-bw : 0; 4302*bddcd29dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 4303*bddcd29dSMark Adams PetscScalar *batmp = ba_csr + bi_csr[rowb]; 4304*bddcd29dSMark Adams /* load in initial (unfactored row) */ 4305*bddcd29dSMark Adams for (int j=threadIdx.x ; j<nza ; j += blockDim.x) { 4306*bddcd29dSMark Adams if (j<nza) { 4307*bddcd29dSMark Adams PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart; 4308*bddcd29dSMark Adams PetscScalar vala = av[j]; 4309*bddcd29dSMark Adams batmp[idx] = vala; 4310*bddcd29dSMark Adams } 4311*bddcd29dSMark Adams } 4312*bddcd29dSMark Adams } 4313*bddcd29dSMark Adams } 4314*bddcd29dSMark Adams } 4315*bddcd29dSMark Adams // print AIJ_BAND 4316*bddcd29dSMark Adams __global__ 4317*bddcd29dSMark Adams void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[]) 4318*bddcd29dSMark Adams { 4319*bddcd29dSMark Adams // debug 4320*bddcd29dSMark Adams if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){ 4321*bddcd29dSMark Adams printf("B (AIJ) n=%d:\n",(int)n); 4322*bddcd29dSMark Adams for (int rowb=0;rowb<n;rowb++) { 4323*bddcd29dSMark Adams const PetscInt nz = bi_csr[rowb+1] - bi_csr[rowb]; 4324*bddcd29dSMark Adams const PetscScalar *batmp = ba_csr + bi_csr[rowb]; 4325*bddcd29dSMark Adams for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j])); 4326*bddcd29dSMark Adams printf(" bi=%d\n",bi_csr[rowb+1]); 4327*bddcd29dSMark Adams } 4328*bddcd29dSMark Adams } 4329*bddcd29dSMark Adams } 4330*bddcd29dSMark Adams // Band LU kernel --- ba_csr bi_csr 4331*bddcd29dSMark Adams __global__ 4332*bddcd29dSMark Adams void __launch_bounds__(1024,1) 4333*bddcd29dSMark Adams mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[]) 4334*bddcd29dSMark Adams { 4335*bddcd29dSMark Adams extern __shared__ PetscInt smemInt[]; 4336*bddcd29dSMark Adams PetscInt *sm_pkIdx = &smemInt[0]; 4337*bddcd29dSMark Adams const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf; 
4338*bddcd29dSMark Adams const PetscInt field = blockIdx.x, blkIdx = blockIdx.y; 4339*bddcd29dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 4340*bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4341*bddcd29dSMark Adams auto g = cooperative_groups::this_grid(); 4342*bddcd29dSMark Adams #endif 4343*bddcd29dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 4344*bddcd29dSMark Adams for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) { 4345*bddcd29dSMark Adams PetscInt tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears 4346*bddcd29dSMark Adams const PetscInt nzUd = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first 4347*bddcd29dSMark Adams const PetscInt nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y)); 4348*bddcd29dSMark Adams PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset; 4349*bddcd29dSMark Adams const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end) 4350*bddcd29dSMark Adams const PetscScalar Bdd = *pBdd; 4351*bddcd29dSMark Adams const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y; 4352*bddcd29dSMark Adams for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */ 4353*bddcd29dSMark Adams if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */ 4354*bddcd29dSMark Adams const PetscInt bwi = myi > bw ? 
bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block 4355*bddcd29dSMark Adams PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx; 4356*bddcd29dSMark Adams *Aid = *Aid/Bdd; 4357*bddcd29dSMark Adams sm_pkIdx[threadIdx.y] = kIdx; 4358*bddcd29dSMark Adams } 4359*bddcd29dSMark Adams __syncthreads(); // synch on threadIdx.x only 4360*bddcd29dSMark Adams if (idx < nzUd) { /* assuming symmetric structure */ 4361*bddcd29dSMark Adams PetscInt kIdx = sm_pkIdx[threadIdx.y]; 4362*bddcd29dSMark Adams PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx; 4363*bddcd29dSMark Adams PetscScalar *Aij = Aid + 1; 4364*bddcd29dSMark Adams PetscScalar Lid = *Aid; 4365*bddcd29dSMark Adams for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) { 4366*bddcd29dSMark Adams if (jIdx<nzUd) { 4367*bddcd29dSMark Adams Aij[jIdx] -= Lid*baUd[jIdx]; 4368*bddcd29dSMark Adams } 4369*bddcd29dSMark Adams } 4370*bddcd29dSMark Adams } 4371*bddcd29dSMark Adams } 4372*bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4373*bddcd29dSMark Adams g.sync(); 4374*bddcd29dSMark Adams #else 4375*bddcd29dSMark Adams __syncthreads(); 4376*bddcd29dSMark Adams #endif 4377*bddcd29dSMark Adams } /* endof for (i=0; i<n; i++) { */ 4378*bddcd29dSMark Adams } 4379*bddcd29dSMark Adams 4380*bddcd29dSMark Adams static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec); 4381*bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info) 4382*bddcd29dSMark Adams { 4383*bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 4384*bddcd29dSMark Adams Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 4385*bddcd29dSMark Adams if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 4386*bddcd29dSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 4387*bddcd29dSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructA; 4388*bddcd29dSMark Adams 
CsrMatrix *matrixA; 4389*bddcd29dSMark Adams PetscErrorCode ierr; 4390*bddcd29dSMark Adams cudaError_t cerr; 4391*bddcd29dSMark Adams const PetscInt n=A->rmap->n, *ic, *r; 4392*bddcd29dSMark Adams const int *ai_d, *aj_d; 4393*bddcd29dSMark Adams const PetscScalar *aa_d; 4394*bddcd29dSMark Adams PetscScalar *ba_t = cusparseTriFactors->a_band_d; 4395*bddcd29dSMark Adams int *bi_t = cusparseTriFactors->i_band_d; 4396*bddcd29dSMark Adams PetscContainer container; 4397*bddcd29dSMark Adams int Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1; 4398*bddcd29dSMark Adams 4399*bddcd29dSMark Adams PetscFunctionBegin; 4400*bddcd29dSMark Adams if (A->rmap->n == 0) { 4401*bddcd29dSMark Adams PetscFunctionReturn(0); 4402*bddcd29dSMark Adams } 4403*bddcd29dSMark Adams // cusparse setup 4404*bddcd29dSMark Adams if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA"); 4405*bddcd29dSMark Adams matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices 4406*bddcd29dSMark Adams if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct"); 4407*bddcd29dSMark Adams matrixA = (CsrMatrix*)matstructA->mat; 4408*bddcd29dSMark Adams if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat"); 4409*bddcd29dSMark Adams 4410*bddcd29dSMark Adams // factor: get Nf if available 4411*bddcd29dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 4412*bddcd29dSMark Adams if (container) { 4413*bddcd29dSMark Adams PetscInt *pNf=NULL; 4414*bddcd29dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 4415*bddcd29dSMark Adams Nf = (*pNf)%1000; 4416*bddcd29dSMark Adams if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use 4417*bddcd29dSMark Adams } else Nf = 1; 4418*bddcd29dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 
4419*bddcd29dSMark Adams 4420*bddcd29dSMark Adams // get data 4421*bddcd29dSMark Adams ic = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data()); 4422*bddcd29dSMark Adams ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 4423*bddcd29dSMark Adams aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data()); 4424*bddcd29dSMark Adams aa_d = thrust::raw_pointer_cast(matrixA->values->data().get()); 4425*bddcd29dSMark Adams r = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data()); 4426*bddcd29dSMark Adams 4427*bddcd29dSMark Adams cerr = WaitForCUDA();CHKERRCUDA(cerr); 4428*bddcd29dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4429*bddcd29dSMark Adams { 4430*bddcd29dSMark Adams int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf; 4431*bddcd29dSMark Adams int gpuid; 4432*bddcd29dSMark Adams cudaDeviceProp prop; 4433*bddcd29dSMark Adams cudaGetDevice(&gpuid); 4434*bddcd29dSMark Adams cudaGetDeviceProperties(&prop, gpuid); 4435*bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 4436*bddcd29dSMark Adams Ni = 1/nconcurrent; 4437*bddcd29dSMark Adams Ni = 1; 4438*bddcd29dSMark Adams #else 4439*bddcd29dSMark Adams nsm = prop.multiProcessorCount; 4440*bddcd29dSMark Adams Ni = nsm/Nf/nconcurrent; 4441*bddcd29dSMark Adams #endif 4442*bddcd29dSMark Adams team_size = bw/Ni + !!(bw%Ni); 4443*bddcd29dSMark Adams nVec = PetscMin(bw, 1024/team_size); 4444*bddcd29dSMark Adams ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr); 4445*bddcd29dSMark Adams { 4446*bddcd29dSMark Adams dim3 dimBlockTeam(nVec,team_size); 4447*bddcd29dSMark Adams dim3 dimBlockLeague(Nf,Ni); 4448*bddcd29dSMark Adams mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t); 4449*bddcd29dSMark Adams CHECK_LAUNCH_ERROR(); // does a sync 
4450*bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4451*bddcd29dSMark Adams void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t}; 4452*bddcd29dSMark Adams cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL); 4453*bddcd29dSMark Adams #else 4454*bddcd29dSMark Adams mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t); 4455*bddcd29dSMark Adams #endif 4456*bddcd29dSMark Adams CHECK_LAUNCH_ERROR(); // does a sync 4457*bddcd29dSMark Adams #if defined(PETSC_USE_LOG) 4458*bddcd29dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr); 4459*bddcd29dSMark Adams #endif 4460*bddcd29dSMark Adams } 4461*bddcd29dSMark Adams } 4462*bddcd29dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4463*bddcd29dSMark Adams 4464*bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. 
from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */ 4465*bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND; 4466*bddcd29dSMark Adams B->ops->solvetranspose = NULL; // need transpose 4467*bddcd29dSMark Adams B->ops->matsolve = NULL; 4468*bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 4469*bddcd29dSMark Adams 4470*bddcd29dSMark Adams PetscFunctionReturn(0); 4471*bddcd29dSMark Adams } 4472*bddcd29dSMark Adams 4473*bddcd29dSMark Adams static PetscErrorCode MatrixNfDestroy(void *ptr) 4474*bddcd29dSMark Adams { 4475*bddcd29dSMark Adams PetscInt *nf = (PetscInt *)ptr; 4476*bddcd29dSMark Adams PetscErrorCode ierr; 4477*bddcd29dSMark Adams PetscFunctionBegin; 4478*bddcd29dSMark Adams ierr = PetscFree(nf);CHKERRQ(ierr); 4479*bddcd29dSMark Adams PetscFunctionReturn(0); 4480*bddcd29dSMark Adams } 4481*bddcd29dSMark Adams 4482*bddcd29dSMark Adams PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4483*bddcd29dSMark Adams { 4484*bddcd29dSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b; 4485*bddcd29dSMark Adams IS isicol; 4486*bddcd29dSMark Adams PetscErrorCode ierr; 4487*bddcd29dSMark Adams cudaError_t cerr; 4488*bddcd29dSMark Adams const PetscInt *ic,*ai=a->i,*aj=a->j; 4489*bddcd29dSMark Adams PetscScalar *ba_t; 4490*bddcd29dSMark Adams int *bi_t; 4491*bddcd29dSMark Adams PetscInt i,n=A->rmap->n,Nf; 4492*bddcd29dSMark Adams PetscInt nzBcsr,bwL,bwU; 4493*bddcd29dSMark Adams PetscBool missing; 4494*bddcd29dSMark Adams Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 4495*bddcd29dSMark Adams PetscContainer container; 4496*bddcd29dSMark Adams 4497*bddcd29dSMark Adams PetscFunctionBegin; 4498*bddcd29dSMark Adams if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square"); 4499*bddcd29dSMark Adams ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr); 4500*bddcd29dSMark Adams if (missing) 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i); 4501*bddcd29dSMark Adams if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors"); 4502*bddcd29dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr); 4503*bddcd29dSMark Adams if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structrally symmetric matrices supported"); 4504*bddcd29dSMark Adams 4505*bddcd29dSMark Adams // factor: get Nf if available 4506*bddcd29dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 4507*bddcd29dSMark Adams if (container) { 4508*bddcd29dSMark Adams PetscInt *pNf=NULL; 4509*bddcd29dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 4510*bddcd29dSMark Adams Nf = (*pNf)%1000; 4511*bddcd29dSMark Adams ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr); 4512*bddcd29dSMark Adams ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr); 4513*bddcd29dSMark Adams *pNf = Nf; 4514*bddcd29dSMark Adams ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr); 4515*bddcd29dSMark Adams ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr); 4516*bddcd29dSMark Adams ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr); 4517*bddcd29dSMark Adams ierr = PetscContainerDestroy(&container);CHKERRQ(ierr); 4518*bddcd29dSMark Adams } else Nf = 1; 4519*bddcd29dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 4520*bddcd29dSMark Adams 4521*bddcd29dSMark Adams ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4522*bddcd29dSMark Adams ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4523*bddcd29dSMark Adams 4524*bddcd29dSMark Adams ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 4525*bddcd29dSMark Adams 
ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr); 4526*bddcd29dSMark Adams b = (Mat_SeqAIJ*)(B)->data; 4527*bddcd29dSMark Adams 4528*bddcd29dSMark Adams /* get band widths, MatComputeBandwidth should take a reordering ic and do this */ 4529*bddcd29dSMark Adams bwL = bwU = 0; 4530*bddcd29dSMark Adams for (int rwb=0; rwb<n; rwb++) { 4531*bddcd29dSMark Adams const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb]; 4532*bddcd29dSMark Adams for (int j=0;j<anz;j++) { 4533*bddcd29dSMark Adams PetscInt colb = ic[ajtmp[j]]; 4534*bddcd29dSMark Adams if (colb<rwa) { // L 4535*bddcd29dSMark Adams if (rwa-colb > bwL) bwL = rwa-colb; 4536*bddcd29dSMark Adams } else { 4537*bddcd29dSMark Adams if (colb-rwa > bwU) bwU = colb-rwa; 4538*bddcd29dSMark Adams } 4539*bddcd29dSMark Adams } 4540*bddcd29dSMark Adams } 4541*bddcd29dSMark Adams ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4542*bddcd29dSMark Adams /* only support structurally symmetric, but it might work */ 4543*bddcd29dSMark Adams if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU); 4544*bddcd29dSMark Adams ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 4545*bddcd29dSMark Adams nzBcsr = n + (2*n-1)*bwU - bwU*bwU; 4546*bddcd29dSMark Adams b->maxnz = b->nz = nzBcsr; 4547*bddcd29dSMark Adams cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz 4548*bddcd29dSMark Adams if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 4549*bddcd29dSMark Adams cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // incude a place for flops 4550*bddcd29dSMark Adams cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr); 4551*bddcd29dSMark Adams cusparseTriFactors->a_band_d = ba_t; 4552*bddcd29dSMark Adams cusparseTriFactors->i_band_d = bi_t; 4553*bddcd29dSMark Adams /* In b structure: Free imax, ilen, old 
a, old j. Allocate solve_work, new a, new j */ 4554*bddcd29dSMark Adams ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr); 4555*bddcd29dSMark Adams { 4556*bddcd29dSMark Adams dim3 dimBlockTeam(1,128); 4557*bddcd29dSMark Adams dim3 dimBlockLeague(Nf,1); 4558*bddcd29dSMark Adams mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t); 4559*bddcd29dSMark Adams } 4560*bddcd29dSMark Adams CHECK_LAUNCH_ERROR(); // does a sync 4561*bddcd29dSMark Adams 4562*bddcd29dSMark Adams // setup data 4563*bddcd29dSMark Adams if (!cusparseTriFactors->rpermIndices) { 4564*bddcd29dSMark Adams const PetscInt *r; 4565*bddcd29dSMark Adams 4566*bddcd29dSMark Adams ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4567*bddcd29dSMark Adams cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 4568*bddcd29dSMark Adams cusparseTriFactors->rpermIndices->assign(r, r+n); 4569*bddcd29dSMark Adams ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4570*bddcd29dSMark Adams ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4571*bddcd29dSMark Adams } 4572*bddcd29dSMark Adams /* upper triangular indices */ 4573*bddcd29dSMark Adams if (!cusparseTriFactors->cpermIndices) { 4574*bddcd29dSMark Adams const PetscInt *c; 4575*bddcd29dSMark Adams 4576*bddcd29dSMark Adams ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr); 4577*bddcd29dSMark Adams cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 4578*bddcd29dSMark Adams cusparseTriFactors->cpermIndices->assign(c, c+n); 4579*bddcd29dSMark Adams ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr); 4580*bddcd29dSMark Adams ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4581*bddcd29dSMark Adams } 4582*bddcd29dSMark Adams 4583*bddcd29dSMark Adams /* put together the new matrix */ 4584*bddcd29dSMark Adams b->free_a = PETSC_FALSE; 4585*bddcd29dSMark Adams b->free_ij = PETSC_FALSE; 4586*bddcd29dSMark Adams b->singlemalloc = PETSC_FALSE; 4587*bddcd29dSMark Adams b->ilen = 
NULL; 4588*bddcd29dSMark Adams b->imax = NULL; 4589*bddcd29dSMark Adams b->row = isrow; 4590*bddcd29dSMark Adams b->col = iscol; 4591*bddcd29dSMark Adams ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4592*bddcd29dSMark Adams ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4593*bddcd29dSMark Adams b->icol = isicol; 4594*bddcd29dSMark Adams ierr = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr); 4595*bddcd29dSMark Adams 4596*bddcd29dSMark Adams B->factortype = MAT_FACTOR_LU; 4597*bddcd29dSMark Adams B->info.factor_mallocs = 0; 4598*bddcd29dSMark Adams B->info.fill_ratio_given = 0; 4599*bddcd29dSMark Adams 4600*bddcd29dSMark Adams if (ai[n]) { 4601*bddcd29dSMark Adams B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]); 4602*bddcd29dSMark Adams } else { 4603*bddcd29dSMark Adams B->info.fill_ratio_needed = 0.0; 4604*bddcd29dSMark Adams } 4605*bddcd29dSMark Adams #if defined(PETSC_USE_INFO) 4606*bddcd29dSMark Adams if (ai[n] != 0) { 4607*bddcd29dSMark Adams PetscReal af = B->info.fill_ratio_needed; 4608*bddcd29dSMark Adams ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr); 4609*bddcd29dSMark Adams } else { 4610*bddcd29dSMark Adams ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 4611*bddcd29dSMark Adams } 4612*bddcd29dSMark Adams #endif 4613*bddcd29dSMark Adams if (a->inode.size) { 4614*bddcd29dSMark Adams ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr); 4615*bddcd29dSMark Adams } 4616*bddcd29dSMark Adams ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr); 4617*bddcd29dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND; 4618*bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 4619*bddcd29dSMark Adams 4620*bddcd29dSMark Adams PetscFunctionReturn(0); 4621*bddcd29dSMark Adams } 4622*bddcd29dSMark Adams 4623*bddcd29dSMark Adams /* Use -pc_factor_mat_solver_type cusparseband */ 4624*bddcd29dSMark Adams PetscErrorCode 
MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type) 4625*bddcd29dSMark Adams { 4626*bddcd29dSMark Adams PetscFunctionBegin; 4627*bddcd29dSMark Adams *type = MATSOLVERCUSPARSEBAND; 4628*bddcd29dSMark Adams PetscFunctionReturn(0); 4629*bddcd29dSMark Adams } 4630*bddcd29dSMark Adams 4631*bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B) 4632*bddcd29dSMark Adams { 4633*bddcd29dSMark Adams PetscErrorCode ierr; 4634*bddcd29dSMark Adams PetscInt n = A->rmap->n; 4635*bddcd29dSMark Adams 4636*bddcd29dSMark Adams PetscFunctionBegin; 4637*bddcd29dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 4638*bddcd29dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 4639*bddcd29dSMark Adams (*B)->factortype = ftype; 4640*bddcd29dSMark Adams (*B)->useordering = PETSC_TRUE; 4641*bddcd29dSMark Adams ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4642*bddcd29dSMark Adams 4643*bddcd29dSMark Adams if (ftype == MAT_FACTOR_LU) { 4644*bddcd29dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 4645*bddcd29dSMark Adams (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 4646*bddcd29dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND; 4647*bddcd29dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types"); 4648*bddcd29dSMark Adams 4649*bddcd29dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 4650*bddcd29dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr); 4651*bddcd29dSMark Adams PetscFunctionReturn(0); 4652*bddcd29dSMark Adams } 4653*bddcd29dSMark Adams 4654*bddcd29dSMark Adams #define WARP_SIZE 32 4655*bddcd29dSMark Adams template <typename T> 4656*bddcd29dSMark Adams __forceinline__ __device__ 
T wreduce(T a)
{
  T b;
  #pragma unroll
  for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
    b = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}

/*
  breduce - Sum one value per thread over a whole thread block.

  Template parameters:
    T          - value type being summed
    BLOCK_SIZE - number of threads in the block; must be a multiple of WARP_SIZE
                 and contain at most WARP_SIZE warps

  Input:
    a - this thread's contribution

  Returns the block-wide sum in thread 0; the value returned to every other
  thread is unspecified.

  Must be called by every thread of the block because it contains block barriers.
  The trailing barrier makes back-to-back calls safe: without it a fast warp could
  overwrite its slot of the shared scratch buffer for the next reduction while
  warp 0 is still reading the current one.
*/
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  static_assert(BLOCK_SIZE >= WARP_SIZE && BLOCK_SIZE % WARP_SIZE == 0, "BLOCK_SIZE must be a positive multiple of WARP_SIZE");
  static_assert(BLOCK_SIZE/WARP_SIZE <= WARP_SIZE, "the per-warp partial sums must fit in a single warp");
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  __shared__ T  buf[NWARP];            // one partial sum per warp, stored in the reduced type
  const int     wid    = threadIdx.x / WARP_SIZE;
  const int     laneid = threadIdx.x % WARP_SIZE;
  const T       b      = wreduce<T>(a); // stage 1: sum within each warp

  if (laneid == 0) buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    // stage 2: warp 0 sums the partials; unused lanes contribute zero so a full-warp
    // reduction is correct for any NWARP, including non powers of two
    a = (threadIdx.x < NWARP) ? buf[threadIdx.x] : T(0);
    a = wreduce<T>(a);
  }
  __syncthreads(); // buf may be rewritten by the next call only after warp 0 has read it
  return a;
}


/*
  mat_solve_band - In-place triangular solves with a band LU factorization.

  Launch configuration: one thread block per field (gridDim.x = Nf), BLOCK_SIZE
  threads per block. Block `field` handles the nloc = n/Nf rows [start,end).

  Input:
    n      - global number of rows
    bw     - bandwidth
    ba_csr - factor values; each row is walked as its L entries, the diagonal, then
             its U entries (NOTE(review): exact packing of the chopped corners is
             defined by the factorization code - confirm there)
  Input/Output:
    x      - right-hand side on entry, solution on exit

  The forward sweep uses a unit diagonal for L; the backward sweep divides by the
  stored diagonal. Rows are processed one at a time, with the dot product of each
  row spread over the threads of the block and combined with breduce().
*/
// Band LU kernel ---  ba_csr bi_csr
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
  const PetscScalar *pLi;
  const int tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    const PetscInt col = locDD<bw ? start : (glbDD-bw);
    PetscScalar t = 0;
    // each thread accumulates a strided share of this row's L entries times x
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // /1.0
    __syncthreads(); // make the updated x[glbDD] visible before the next row reads it
    // inc
    pLi += glbDD-col; // get to diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
    else pLi += bw;
    pLi += 1; // skip to next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
    PetscScalar t = 0;
    // walk the U part of the row backwards from its last entry
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads(); // make the updated x[glbDD] visible before the previous row reads it
    // inc past L to start of previous U
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
  }
}

/*
  MatSolve_SeqAIJCUSPARSEBAND - Solve A x = b on the GPU using the band LU factors
  stored in the matrix.

  Input:
    A  - the factored matrix
    bb - the right-hand side
  Output:
    xx - the solution

  Steps: gather bb through the row permutation into the work vector, run
  mat_solve_band() in place on the work vector, then gather the work vector through
  the column permutation into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  const PetscInt                        n = A->rmap->n, nz = cusparseTriFactors->nnz;
  /* The bandwidth is recovered from nz = n + 2*bw*n - bw*(bw+1). The discriminant is formed
     in floating point because n*n overflows a 32-bit PetscInt once n exceeds 46340. */
  const PetscReal                       disc = 1.0 + 4.0*((PetscReal)n*(PetscReal)n - (PetscReal)nz);
  const PetscInt                        bw = (2*n-1 - (PetscInt)(PetscSqrtReal(disc)+PETSC_MACHINE_EPSILON))/2;
  PetscInt                              Nf;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscContainer                        container;

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000; // NOTE(review): only the low three decimal digits are used as the field count - confirm the encoding with the code that attaches "Nf"
  } else Nf = 1;
  /* Nf is used as a divisor and as the grid size, so it has to be positive */
  if (Nf < 1) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_OUTOFRANGE,"Nf must be positive, got %D",Nf);
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  /* One thread block per field; the block size must match the kernel's template argument.
     NOTE(review): the kernel is launched on the default stream while the copies use
     PetscDefaultCudaStream - confirm the intended stream ordering. */
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}