19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3bc3f50f2SPaul Mullowney matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15bc3f50f2SPaul Mullowney 16e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 179ae82921SPaul Mullowney 18087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 20087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 21087f3262SPaul Mullowney 226fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 236fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 246fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 25087f3262SPaul Mullowney 266fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 286fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 304416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 316fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 326fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 336fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 346fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 359ae82921SPaul Mullowney 367f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 37470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 38470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 39470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 40470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 417f756511SDominic Meiser 42b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 43b06137fdSPaul Mullowney { 44b06137fdSPaul Mullowney cusparseStatus_t stat; 45b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 46b06137fdSPaul Mullowney 47b06137fdSPaul Mullowney PetscFunctionBegin; 48b06137fdSPaul Mullowney cusparsestruct->stream = stream; 49c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat); 50b06137fdSPaul Mullowney PetscFunctionReturn(0); 51b06137fdSPaul Mullowney } 52b06137fdSPaul Mullowney 53b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 54b06137fdSPaul Mullowney { 55b06137fdSPaul Mullowney cusparseStatus_t stat; 56b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 57b06137fdSPaul Mullowney 58b06137fdSPaul Mullowney PetscFunctionBegin; 596b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 6016a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 61c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat); 6216a2e217SAlejandro Lamas Daviña } 63b06137fdSPaul Mullowney cusparsestruct->handle = handle; 646b1cf21dSAlejandro Lamas Daviña } 65c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 66b06137fdSPaul Mullowney PetscFunctionReturn(0); 67b06137fdSPaul Mullowney } 68b06137fdSPaul Mullowney 69b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 70b06137fdSPaul Mullowney { 71b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 72b06137fdSPaul Mullowney PetscFunctionBegin; 73b06137fdSPaul Mullowney if (cusparsestruct->handle) 74b06137fdSPaul Mullowney cusparsestruct->handle = 0; 75b06137fdSPaul Mullowney PetscFunctionReturn(0); 76b06137fdSPaul Mullowney } 77b06137fdSPaul Mullowney 78ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 799ae82921SPaul Mullowney { 809ae82921SPaul Mullowney PetscFunctionBegin; 819ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 829ae82921SPaul Mullowney PetscFunctionReturn(0); 839ae82921SPaul Mullowney } 849ae82921SPaul Mullowney 85c708e6cdSJed Brown /*MC 86087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 87087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 88087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 89087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 90087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 91087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 92c708e6cdSJed Brown 939ae82921SPaul Mullowney Level: beginner 94c708e6cdSJed Brown 953ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 96c708e6cdSJed Brown M*/ 979ae82921SPaul Mullowney 9842c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 999ae82921SPaul Mullowney { 1009ae82921SPaul Mullowney PetscErrorCode ierr; 101bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1029ae82921SPaul Mullowney 1039ae82921SPaul Mullowney PetscFunctionBegin; 104bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 105404133a2SPaul Mullowney (*B)->factortype = ftype; 106bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1079ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1082205254eSKarl Rupp 109087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 11033d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1119ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1129ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 113087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 114087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 115087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1169ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 117bc3f50f2SPaul Mullowney 118fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1193ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1209ae82921SPaul Mullowney PetscFunctionReturn(0); 1219ae82921SPaul Mullowney } 1229ae82921SPaul Mullowney 123bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 124ca45077fSPaul Mullowney { 125aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1266e111a19SKarl Rupp 127ca45077fSPaul Mullowney PetscFunctionBegin; 128ca45077fSPaul Mullowney switch (op) { 129e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 130aa372e3fSPaul Mullowney cusparsestruct->format = format; 131ca45077fSPaul Mullowney break; 132e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 133aa372e3fSPaul Mullowney cusparsestruct->format = format; 134ca45077fSPaul Mullowney break; 135ca45077fSPaul Mullowney default: 13636d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 137ca45077fSPaul Mullowney } 138ca45077fSPaul Mullowney PetscFunctionReturn(0); 139ca45077fSPaul Mullowney } 1409ae82921SPaul Mullowney 141e057df02SPaul Mullowney /*@ 142e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 143e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 144aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 145e057df02SPaul Mullowney Not Collective 146e057df02SPaul Mullowney 147e057df02SPaul Mullowney Input Parameters: 1488468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 14936d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1502692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 151e057df02SPaul Mullowney 152e057df02SPaul Mullowney Output Parameter: 153e057df02SPaul Mullowney 154e057df02SPaul Mullowney Level: intermediate 155e057df02SPaul Mullowney 1568468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 157e057df02SPaul Mullowney @*/ 158e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 159e057df02SPaul Mullowney { 160e057df02SPaul Mullowney PetscErrorCode ierr; 1616e111a19SKarl Rupp 162e057df02SPaul Mullowney PetscFunctionBegin; 163e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 164e057df02SPaul Mullowney ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 165e057df02SPaul Mullowney PetscFunctionReturn(0); 166e057df02SPaul Mullowney } 167e057df02SPaul Mullowney 1684416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 1699ae82921SPaul Mullowney { 1709ae82921SPaul Mullowney PetscErrorCode ierr; 171e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 1729ae82921SPaul Mullowney PetscBool flg; 173a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1746e111a19SKarl Rupp 1759ae82921SPaul Mullowney PetscFunctionBegin; 176e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 1779ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 178e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 179a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 180e057df02SPaul Mullowney if (flg) { 181e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 182045c96e1SPaul Mullowney } 1839ae82921SPaul Mullowney } 1844c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 185a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 1864c87dfd4SPaul Mullowney if (flg) { 1874c87dfd4SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 1884c87dfd4SPaul Mullowney } 1890af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 1909ae82921SPaul Mullowney PetscFunctionReturn(0); 1919ae82921SPaul Mullowney 1929ae82921SPaul Mullowney } 1939ae82921SPaul Mullowney 1946fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1959ae82921SPaul Mullowney { 1969ae82921SPaul Mullowney PetscErrorCode ierr; 1979ae82921SPaul Mullowney 1989ae82921SPaul Mullowney PetscFunctionBegin; 1999ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2009ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2019ae82921SPaul Mullowney PetscFunctionReturn(0); 2029ae82921SPaul Mullowney } 2039ae82921SPaul Mullowney 2046fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2059ae82921SPaul Mullowney { 2069ae82921SPaul Mullowney PetscErrorCode ierr; 2079ae82921SPaul Mullowney 2089ae82921SPaul Mullowney PetscFunctionBegin; 2099ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2109ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2119ae82921SPaul Mullowney PetscFunctionReturn(0); 2129ae82921SPaul Mullowney } 2139ae82921SPaul Mullowney 214087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 215087f3262SPaul Mullowney { 216087f3262SPaul Mullowney PetscErrorCode ierr; 217087f3262SPaul Mullowney 218087f3262SPaul Mullowney PetscFunctionBegin; 219087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 220087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 221087f3262SPaul Mullowney PetscFunctionReturn(0); 222087f3262SPaul Mullowney } 223087f3262SPaul Mullowney 224087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 225087f3262SPaul Mullowney { 226087f3262SPaul Mullowney PetscErrorCode ierr; 227087f3262SPaul Mullowney 228087f3262SPaul Mullowney PetscFunctionBegin; 229087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 230087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 231087f3262SPaul Mullowney PetscFunctionReturn(0); 232087f3262SPaul Mullowney } 233087f3262SPaul Mullowney 234087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 2359ae82921SPaul Mullowney { 2369ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2379ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2389ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 239aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 2409ae82921SPaul Mullowney cusparseStatus_t stat; 2419ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2429ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2439ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2449ae82921SPaul Mullowney PetscScalar *AALo; 2459ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 246b175d8bbSPaul Mullowney PetscErrorCode ierr; 2479ae82921SPaul Mullowney 2489ae82921SPaul Mullowney PetscFunctionBegin; 249cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 250b8ced49eSKarl Rupp if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 2519ae82921SPaul Mullowney try { 2529ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2539ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 2549ae82921SPaul Mullowney 2559ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 256c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 257c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr); 258c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr); 2599ae82921SPaul Mullowney 2609ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2619ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2629ae82921SPaul Mullowney AiLo[n] = nzLower; 2639ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2649ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 2659ae82921SPaul Mullowney v = aa; 2669ae82921SPaul Mullowney vi = aj; 2679ae82921SPaul Mullowney offset = 1; 2689ae82921SPaul Mullowney rowOffset= 1; 2699ae82921SPaul Mullowney for (i=1; i<n; i++) { 2709ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 271e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 2729ae82921SPaul Mullowney AiLo[i] = rowOffset; 2739ae82921SPaul Mullowney rowOffset += nz+1; 2749ae82921SPaul Mullowney 275580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 276580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 2779ae82921SPaul Mullowney 2789ae82921SPaul Mullowney offset += nz; 2799ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 2809ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 2819ae82921SPaul Mullowney offset += 1; 2829ae82921SPaul Mullowney 2839ae82921SPaul Mullowney v += nz; 2849ae82921SPaul Mullowney vi += nz; 2859ae82921SPaul Mullowney } 2862205254eSKarl Rupp 287aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 288aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 2892205254eSKarl Rupp 290aa372e3fSPaul Mullowney /* Create the matrix description */ 291c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 292c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 293c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 294c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat); 295c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 296aa372e3fSPaul Mullowney 297aa372e3fSPaul Mullowney /* Create the solve analysis information */ 298c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 299aa372e3fSPaul Mullowney 300aa372e3fSPaul Mullowney /* set the operation */ 301aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 302aa372e3fSPaul Mullowney 303aa372e3fSPaul Mullowney /* set the matrix */ 304aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 305aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 306aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 307aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 308aa372e3fSPaul Mullowney 309aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 310aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 311aa372e3fSPaul Mullowney 312aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 313aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 314aa372e3fSPaul Mullowney 315aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 316aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 317aa372e3fSPaul Mullowney 318aa372e3fSPaul Mullowney /* perform the solve analysis */ 319aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 320aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 321aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 322c41cb2e2SAlejandro Lamas Daviña loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 323aa372e3fSPaul Mullowney 324aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 325aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 3262205254eSKarl Rupp 327c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr); 328c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr); 329c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 3304863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 3319ae82921SPaul Mullowney } catch(char *ex) { 3329ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3339ae82921SPaul Mullowney } 3349ae82921SPaul Mullowney } 3359ae82921SPaul Mullowney PetscFunctionReturn(0); 3369ae82921SPaul Mullowney } 3379ae82921SPaul Mullowney 338087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 3399ae82921SPaul Mullowney { 3409ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3419ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3429ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 343aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 3449ae82921SPaul Mullowney cusparseStatus_t stat; 3459ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 3469ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3479ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 3489ae82921SPaul Mullowney PetscScalar *AAUp; 3499ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 3509ae82921SPaul Mullowney PetscErrorCode ierr; 3519ae82921SPaul Mullowney 3529ae82921SPaul Mullowney PetscFunctionBegin; 353cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 354b8ced49eSKarl Rupp if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 3559ae82921SPaul Mullowney try { 3569ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 3579ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 3589ae82921SPaul Mullowney 3599ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 360c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 361c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 362c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 3639ae82921SPaul Mullowney 3649ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 3659ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 3669ae82921SPaul Mullowney AiUp[n]=nzUpper; 3679ae82921SPaul Mullowney offset = nzUpper; 3689ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 3699ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 3709ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 3719ae82921SPaul Mullowney 372e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 3739ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 3749ae82921SPaul Mullowney 375e057df02SPaul Mullowney /* decrement the offset */ 3769ae82921SPaul Mullowney offset -= (nz+1); 3779ae82921SPaul Mullowney 378e057df02SPaul Mullowney /* first, set the diagonal elements */ 3799ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 38009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 3819ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 3829ae82921SPaul Mullowney 383580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 384580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 3859ae82921SPaul Mullowney } 3862205254eSKarl Rupp 387aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 388aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 3892205254eSKarl Rupp 390aa372e3fSPaul Mullowney /* Create the matrix description */ 391c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 392c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 393c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 394c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 395c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 396aa372e3fSPaul Mullowney 397aa372e3fSPaul Mullowney /* Create the solve analysis information */ 398c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 399aa372e3fSPaul Mullowney 400aa372e3fSPaul Mullowney /* set the operation */ 401aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 402aa372e3fSPaul Mullowney 403aa372e3fSPaul Mullowney /* set the matrix */ 404aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 405aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 406aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 407aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 408aa372e3fSPaul Mullowney 409aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 410aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 411aa372e3fSPaul Mullowney 412aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 413aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 414aa372e3fSPaul Mullowney 415aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 416aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 417aa372e3fSPaul Mullowney 418aa372e3fSPaul Mullowney /* perform the solve analysis */ 419aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 420aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 421aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 422c41cb2e2SAlejandro Lamas Daviña upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 423aa372e3fSPaul Mullowney 424aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 425aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 4262205254eSKarl Rupp 427c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 428c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 429c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 4304863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 4319ae82921SPaul Mullowney } catch(char *ex) { 4329ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4339ae82921SPaul Mullowney } 4349ae82921SPaul Mullowney } 4359ae82921SPaul Mullowney PetscFunctionReturn(0); 4369ae82921SPaul Mullowney } 4379ae82921SPaul Mullowney 438087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 4399ae82921SPaul Mullowney { 4409ae82921SPaul Mullowney PetscErrorCode ierr; 4419ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4429ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4439ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 4449ae82921SPaul Mullowney PetscBool row_identity,col_identity; 4459ae82921SPaul Mullowney const PetscInt *r,*c; 4469ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4479ae82921SPaul Mullowney 4489ae82921SPaul Mullowney PetscFunctionBegin; 449087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 450087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 4512205254eSKarl Rupp 452e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 453aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 4549ae82921SPaul Mullowney 455b8ced49eSKarl Rupp A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 456e057df02SPaul Mullowney /* lower triangular indices */ 4579ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4589ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4592205254eSKarl Rupp if (!row_identity) { 460aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 461aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 4622205254eSKarl Rupp } 4639ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4649ae82921SPaul Mullowney 465e057df02SPaul Mullowney /* upper triangular indices */ 4669ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 4679ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4682205254eSKarl Rupp if (!col_identity) { 469aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 470aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 4712205254eSKarl Rupp } 4724863603aSSatish Balay 4734863603aSSatish Balay if (!row_identity && !col_identity) { 4744863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 4754863603aSSatish Balay } else if(!row_identity) { 4764863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4774863603aSSatish Balay } else if(!col_identity) { 4784863603aSSatish Balay ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4794863603aSSatish Balay } 4804863603aSSatish Balay 4819ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 4829ae82921SPaul Mullowney PetscFunctionReturn(0); 4839ae82921SPaul Mullowney } 4849ae82921SPaul Mullowney 485087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 486087f3262SPaul Mullowney { 487087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 488087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 489aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 490aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 491087f3262SPaul Mullowney cusparseStatus_t stat; 492087f3262SPaul Mullowney PetscErrorCode ierr; 493087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 494087f3262SPaul Mullowney PetscScalar *AAUp; 495087f3262SPaul Mullowney PetscScalar *AALo; 496087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 497087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 498087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 499087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 500087f3262SPaul Mullowney 501087f3262SPaul Mullowney PetscFunctionBegin; 502cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 503b8ced49eSKarl Rupp if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 504087f3262SPaul Mullowney try { 505087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 506c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 507c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 508c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 509c41cb2e2SAlejandro Lamas Daviña ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 510087f3262SPaul Mullowney 511087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 512087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 513087f3262SPaul Mullowney AiUp[n]=nzUpper; 514087f3262SPaul Mullowney offset = 0; 515087f3262SPaul Mullowney for (i=0; i<n; i++) { 516087f3262SPaul Mullowney /* set the pointers */ 517087f3262SPaul Mullowney v = aa + ai[i]; 518087f3262SPaul Mullowney vj = aj + ai[i]; 519087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 520087f3262SPaul Mullowney 521087f3262SPaul Mullowney /* first, set the diagonal elements */ 522087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 52309f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 524087f3262SPaul Mullowney AiUp[i] = offset; 52509f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 526087f3262SPaul Mullowney 527087f3262SPaul Mullowney offset+=1; 528087f3262SPaul Mullowney if (nz>0) { 529f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 530580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 531087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 532087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 533087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 534087f3262SPaul Mullowney } 535087f3262SPaul Mullowney offset+=nz; 536087f3262SPaul Mullowney } 537087f3262SPaul Mullowney } 538087f3262SPaul Mullowney 539aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 540aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 541087f3262SPaul Mullowney 542aa372e3fSPaul Mullowney /* Create the matrix description */ 543c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 544c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 545c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 546c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 547c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 548087f3262SPaul Mullowney 549aa372e3fSPaul Mullowney /* Create the solve analysis information */ 550c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 551aa372e3fSPaul Mullowney 552aa372e3fSPaul Mullowney /* set the operation */ 553aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 554aa372e3fSPaul Mullowney 555aa372e3fSPaul Mullowney /* set the matrix */ 556aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 557aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 558aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 559aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 560aa372e3fSPaul Mullowney 561aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 562aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 563aa372e3fSPaul Mullowney 564aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 565aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 566aa372e3fSPaul Mullowney 567aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 568aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 569aa372e3fSPaul Mullowney 570aa372e3fSPaul Mullowney /* perform the solve analysis */ 571aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 572aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 573aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 574c41cb2e2SAlejandro Lamas Daviña upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 575aa372e3fSPaul Mullowney 576aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 577aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 578aa372e3fSPaul Mullowney 579aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 580aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 581aa372e3fSPaul Mullowney 582aa372e3fSPaul Mullowney /* Create the matrix description */ 583c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 584c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 585c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 586c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 587c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 588aa372e3fSPaul Mullowney 589aa372e3fSPaul Mullowney /* Create the solve analysis information */ 590c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 591aa372e3fSPaul Mullowney 592aa372e3fSPaul Mullowney /* set the operation */ 593aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 594aa372e3fSPaul Mullowney 595aa372e3fSPaul Mullowney /* set the matrix */ 596aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 597aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 598aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 599aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 600aa372e3fSPaul Mullowney 601aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 602aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 603aa372e3fSPaul Mullowney 604aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 605aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 606aa372e3fSPaul Mullowney 607aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 608aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 6094863603aSSatish Balay ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 610aa372e3fSPaul Mullowney 611aa372e3fSPaul Mullowney /* perform the solve analysis */ 612aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 613aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 614aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 615c41cb2e2SAlejandro Lamas Daviña loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 616aa372e3fSPaul Mullowney 617aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 618aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 619087f3262SPaul Mullowney 620b8ced49eSKarl Rupp A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 621c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 622c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 623c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 624c41cb2e2SAlejandro Lamas Daviña ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 625087f3262SPaul Mullowney } catch(char *ex) { 626087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 627087f3262SPaul Mullowney } 628087f3262SPaul Mullowney } 629087f3262SPaul Mullowney PetscFunctionReturn(0); 630087f3262SPaul Mullowney } 631087f3262SPaul Mullowney 632087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 6339ae82921SPaul Mullowney { 6349ae82921SPaul Mullowney PetscErrorCode ierr; 635087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 636087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 637087f3262SPaul Mullowney IS ip = a->row; 638087f3262SPaul Mullowney const PetscInt *rip; 639087f3262SPaul Mullowney PetscBool perm_identity; 640087f3262SPaul Mullowney PetscInt n = A->rmap->n; 641087f3262SPaul Mullowney 642087f3262SPaul Mullowney PetscFunctionBegin; 643087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 644e65717acSKarl Rupp cusparseTriFactors->workVector = new THRUSTARRAY(n); 645aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 646aa372e3fSPaul Mullowney 647087f3262SPaul Mullowney /* lower triangular indices */ 648087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 649087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 650087f3262SPaul Mullowney if (!perm_identity) { 651*4e4bbfaaSStefano Zampini IS iip; 652*4e4bbfaaSStefano Zampini const PetscInt *irip; 653*4e4bbfaaSStefano Zampini 654*4e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 655*4e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 656aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 657aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 658aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 659*4e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 660*4e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 661*4e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 6624863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 663087f3262SPaul Mullowney } 664087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 665087f3262SPaul Mullowney PetscFunctionReturn(0); 666087f3262SPaul Mullowney } 667087f3262SPaul Mullowney 6686fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 6699ae82921SPaul Mullowney { 6709ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 6719ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 6729ae82921SPaul Mullowney PetscBool row_identity,col_identity; 673b175d8bbSPaul Mullowney PetscErrorCode ierr; 6749ae82921SPaul Mullowney 6759ae82921SPaul Mullowney PetscFunctionBegin; 6769ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 677e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 6789ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6799ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 680bda325fcSPaul Mullowney if (row_identity && col_identity) { 681bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 682bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 683*4e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 684*4e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 685bda325fcSPaul Mullowney } else { 686bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 687bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 688*4e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 689*4e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 690bda325fcSPaul Mullowney } 6918dc1d2a3SPaul Mullowney 692e057df02SPaul Mullowney /* get the triangular factors */ 693087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 6949ae82921SPaul Mullowney PetscFunctionReturn(0); 6959ae82921SPaul Mullowney } 6969ae82921SPaul Mullowney 697087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 698087f3262SPaul Mullowney { 699087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 700087f3262SPaul Mullowney IS ip = b->row; 701087f3262SPaul Mullowney PetscBool perm_identity; 702b175d8bbSPaul Mullowney PetscErrorCode ierr; 703087f3262SPaul Mullowney 704087f3262SPaul Mullowney PetscFunctionBegin; 705087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 706087f3262SPaul Mullowney 707087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 708087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 709087f3262SPaul Mullowney if (perm_identity) { 710087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 711087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 712*4e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 713*4e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 714087f3262SPaul Mullowney } else { 715087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 716087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 717*4e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 718*4e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 719087f3262SPaul Mullowney } 720087f3262SPaul Mullowney 721087f3262SPaul Mullowney /* get the triangular factors */ 722087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 723087f3262SPaul Mullowney PetscFunctionReturn(0); 724087f3262SPaul Mullowney } 7259ae82921SPaul Mullowney 726b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 727bda325fcSPaul Mullowney { 728bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 729aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 730aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 731aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 732aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 733bda325fcSPaul Mullowney cusparseStatus_t stat; 734aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 735aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 736aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 737aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 738b175d8bbSPaul Mullowney 739bda325fcSPaul Mullowney PetscFunctionBegin; 740bda325fcSPaul Mullowney 741aa372e3fSPaul Mullowney /*********************************************/ 742aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 743aa372e3fSPaul Mullowney /*********************************************/ 744aa372e3fSPaul Mullowney 745aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 746aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 747aa372e3fSPaul Mullowney 748aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 749aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 750aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 751aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 752aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 753aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 754aa372e3fSPaul Mullowney 755aa372e3fSPaul Mullowney /* Create the matrix description */ 756c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 757c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 758c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 759c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 760c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 761aa372e3fSPaul Mullowney 762aa372e3fSPaul Mullowney /* Create the solve analysis information */ 763c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 764aa372e3fSPaul Mullowney 765aa372e3fSPaul Mullowney /* set the operation */ 766aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 767aa372e3fSPaul Mullowney 768aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 769aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 770aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 771aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 772aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 773aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 774aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 775aa372e3fSPaul Mullowney loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 776aa372e3fSPaul Mullowney 777aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 778aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 779aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 780aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 781aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 782aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 783aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 784aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 785aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 786c41cb2e2SAlejandro Lamas Daviña CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 787aa372e3fSPaul Mullowney 788aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 789aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 790aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 791aa372e3fSPaul Mullowney loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 792aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 793c41cb2e2SAlejandro Lamas Daviña loTriFactorT->solveInfo);CHKERRCUDA(stat); 794aa372e3fSPaul Mullowney 795aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 796aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 797aa372e3fSPaul Mullowney 798aa372e3fSPaul Mullowney /*********************************************/ 799aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 800aa372e3fSPaul Mullowney /*********************************************/ 801aa372e3fSPaul Mullowney 802aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 803aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 804aa372e3fSPaul Mullowney 805aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 806aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 807aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 808aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 809aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 810aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 811aa372e3fSPaul Mullowney 812aa372e3fSPaul Mullowney /* Create the matrix description */ 813c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 814c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 815c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 816c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 817c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 818aa372e3fSPaul Mullowney 819aa372e3fSPaul Mullowney /* Create the solve analysis information */ 820c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 821aa372e3fSPaul Mullowney 822aa372e3fSPaul Mullowney /* set the operation */ 823aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 824aa372e3fSPaul Mullowney 825aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 826aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 827aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 828aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 829aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 830aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 831aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 832aa372e3fSPaul Mullowney upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 833aa372e3fSPaul Mullowney 834aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 835aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 836aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 837aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 838aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 839aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 840aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 841aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 842aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 843c41cb2e2SAlejandro Lamas Daviña CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 846aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 847aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 848aa372e3fSPaul Mullowney upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 849aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 850c41cb2e2SAlejandro Lamas Daviña upTriFactorT->solveInfo);CHKERRCUDA(stat); 851aa372e3fSPaul Mullowney 852aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 853aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 854bda325fcSPaul Mullowney PetscFunctionReturn(0); 855bda325fcSPaul Mullowney } 856bda325fcSPaul Mullowney 857b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 858bda325fcSPaul Mullowney { 859aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 860aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 861aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 862bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 863bda325fcSPaul Mullowney cusparseStatus_t stat; 864aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 865b06137fdSPaul Mullowney cudaError_t err; 8664863603aSSatish Balay PetscErrorCode ierr; 867b175d8bbSPaul Mullowney 868bda325fcSPaul Mullowney PetscFunctionBegin; 869aa372e3fSPaul Mullowney 870aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 871aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 872c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 873aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 874c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 875c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 876aa372e3fSPaul Mullowney 877b06137fdSPaul Mullowney /* set alpha and beta */ 878c41cb2e2SAlejandro Lamas Daviña err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 8797656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 8807656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 8817656d835SStefano Zampini err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 8827656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 8837656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 884c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 885b06137fdSPaul Mullowney 886aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 887aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 888aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 889a3fdcf43SKarl Rupp thrust::device_vector<int> *rowoffsets_gpu; 890a3fdcf43SKarl Rupp const int ncomprow = matstruct->cprowIndices->size(); 891a3fdcf43SKarl Rupp thrust::host_vector<int> cprow(ncomprow); 892a3fdcf43SKarl Rupp cprow = *matstruct->cprowIndices; // GPU --> CPU 893554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 894554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 895aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 896a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 897aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 898aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 899a3fdcf43SKarl Rupp PetscInt k,i; 900a3fdcf43SKarl Rupp thrust::host_vector<int> rowst_host(A->rmap->n+1); 901a3fdcf43SKarl Rupp 902a3fdcf43SKarl Rupp /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */ 903a3fdcf43SKarl Rupp rowst_host[0] = 0; 904a3fdcf43SKarl Rupp for (k = 0, i = 0; i < A->rmap->n ; i++) { 905a3fdcf43SKarl Rupp if (k < ncomprow && i==cprow[k]) { 906a3fdcf43SKarl Rupp rowst_host[i+1] = a->i[i+1]; 907a3fdcf43SKarl Rupp k++; 908a3fdcf43SKarl Rupp } else rowst_host[i+1] = rowst_host[i]; 909a3fdcf43SKarl Rupp } 910a3fdcf43SKarl Rupp if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow); 911a3fdcf43SKarl Rupp rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 912a3fdcf43SKarl Rupp *rowoffsets_gpu = rowst_host; // CPU --> GPU 913aa372e3fSPaul Mullowney 914aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 915aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 916a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 917a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 918aa372e3fSPaul Mullowney matrix->values->data().get(), 919a3fdcf43SKarl Rupp rowoffsets_gpu->data().get(), 920aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 921aa372e3fSPaul Mullowney matrixT->values->data().get(), 922aa372e3fSPaul Mullowney matrixT->column_indices->data().get(), 923aa372e3fSPaul Mullowney matrixT->row_offsets->data().get(), 924c41cb2e2SAlejandro Lamas Daviña CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 925aa372e3fSPaul Mullowney 926aa372e3fSPaul Mullowney /* assign the pointer */ 927aa372e3fSPaul Mullowney matstructT->mat = matrixT; 9284863603aSSatish Balay ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 929aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 930aa372e3fSPaul Mullowney /* First convert HYB to CSR */ 931aa372e3fSPaul Mullowney CsrMatrix *temp= new CsrMatrix; 932aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 933aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 934aa372e3fSPaul Mullowney temp->num_entries = a->nz; 935aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 936aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 937aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 938aa372e3fSPaul Mullowney 9392692e278SPaul Mullowney 940aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 941aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 942aa372e3fSPaul Mullowney temp->values->data().get(), 943aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 944c41cb2e2SAlejandro Lamas Daviña temp->column_indices->data().get());CHKERRCUDA(stat); 945aa372e3fSPaul Mullowney 946aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 947aa372e3fSPaul Mullowney CsrMatrix *tempT= new CsrMatrix; 948aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 949aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 950aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 951aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 952aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 953aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 954aa372e3fSPaul Mullowney 955aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 956aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 957aa372e3fSPaul Mullowney temp->values->data().get(), 958aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 959aa372e3fSPaul Mullowney temp->column_indices->data().get(), 960aa372e3fSPaul Mullowney tempT->values->data().get(), 961aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 962aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 963c41cb2e2SAlejandro Lamas Daviña CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 964aa372e3fSPaul Mullowney 965aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 966aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 967c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 968aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 969aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 970aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 971aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 972aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 973aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 974c41cb2e2SAlejandro Lamas Daviña hybMat, 0, partition);CHKERRCUDA(stat); 975aa372e3fSPaul Mullowney 976aa372e3fSPaul Mullowney /* assign the pointer */ 977aa372e3fSPaul Mullowney matstructT->mat = hybMat; 9784863603aSSatish Balay ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 979aa372e3fSPaul Mullowney 980aa372e3fSPaul Mullowney /* delete temporaries */ 981aa372e3fSPaul Mullowney if (tempT) { 982aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 983aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 984aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 985aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 986087f3262SPaul Mullowney } 987aa372e3fSPaul Mullowney if (temp) { 988aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 989aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 990aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 991aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 992aa372e3fSPaul Mullowney } 993aa372e3fSPaul Mullowney } 994aa372e3fSPaul Mullowney /* assign the compressed row indices */ 995aa372e3fSPaul Mullowney matstructT->cprowIndices = new THRUSTINTARRAY; 996554b8892SKarl Rupp matstructT->cprowIndices->resize(A->cmap->n); 997554b8892SKarl Rupp thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 998aa372e3fSPaul Mullowney /* assign the pointer */ 999aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1000bda325fcSPaul Mullowney PetscFunctionReturn(0); 1001bda325fcSPaul Mullowney } 1002bda325fcSPaul Mullowney 1003*4e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 10046fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1005bda325fcSPaul Mullowney { 1006c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1007465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1008465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1009465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1010465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1011bda325fcSPaul Mullowney cusparseStatus_t stat; 1012bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1013aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1014aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1015aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1016b175d8bbSPaul Mullowney PetscErrorCode ierr; 1017bda325fcSPaul Mullowney 1018bda325fcSPaul Mullowney PetscFunctionBegin; 1019aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1020aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1021bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1022aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1023aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1024bda325fcSPaul Mullowney } 1025bda325fcSPaul Mullowney 1026bda325fcSPaul Mullowney /* Get the GPU pointers */ 1027c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1028c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1029c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1030c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1031bda325fcSPaul Mullowney 10327a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1033aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1034c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1035c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1036c41cb2e2SAlejandro Lamas Daviña xGPU); 1037aa372e3fSPaul Mullowney 1038aa372e3fSPaul Mullowney /* First, solve U */ 1039aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 10407656d835SStefano Zampini upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1041aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1042aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1043aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1044aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1045c41cb2e2SAlejandro Lamas Daviña xarray, tempGPU->data().get());CHKERRCUDA(stat); 1046aa372e3fSPaul Mullowney 1047aa372e3fSPaul Mullowney /* Then, solve L */ 1048aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 10497656d835SStefano Zampini loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1050aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1051aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1052aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1053aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1054c41cb2e2SAlejandro Lamas Daviña tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1055aa372e3fSPaul Mullowney 1056aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1057c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1058c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1059aa372e3fSPaul Mullowney tempGPU->begin()); 1060aa372e3fSPaul Mullowney 1061aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1062c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1063bda325fcSPaul Mullowney 1064bda325fcSPaul Mullowney /* restore */ 1065c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1066c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1067c41cb2e2SAlejandro Lamas Daviña ierr = WaitForGPU();CHKERRCUDA(ierr); 1068661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1069958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1070bda325fcSPaul Mullowney PetscFunctionReturn(0); 1071bda325fcSPaul Mullowney } 1072bda325fcSPaul Mullowney 10736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1074bda325fcSPaul Mullowney { 1075465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1076465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1077bda325fcSPaul Mullowney cusparseStatus_t stat; 1078bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1079aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1080aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1081aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1082b175d8bbSPaul Mullowney PetscErrorCode ierr; 1083bda325fcSPaul Mullowney 1084bda325fcSPaul Mullowney PetscFunctionBegin; 1085aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1086aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1087bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1088aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1089aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1090bda325fcSPaul Mullowney } 1091bda325fcSPaul Mullowney 1092bda325fcSPaul Mullowney /* Get the GPU pointers */ 1093c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1094c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1095bda325fcSPaul Mullowney 10967a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1097aa372e3fSPaul Mullowney /* First, solve U */ 1098aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 10997656d835SStefano Zampini upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1100aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1101aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1102aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1103aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1104c41cb2e2SAlejandro Lamas Daviña barray, tempGPU->data().get());CHKERRCUDA(stat); 1105aa372e3fSPaul Mullowney 1106aa372e3fSPaul Mullowney /* Then, solve L */ 1107aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 11087656d835SStefano Zampini loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1109aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1110aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1111aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1112aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1113c41cb2e2SAlejandro Lamas Daviña tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1114bda325fcSPaul Mullowney 1115bda325fcSPaul Mullowney /* restore */ 1116c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1117c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1118c41cb2e2SAlejandro Lamas Daviña ierr = WaitForGPU();CHKERRCUDA(ierr); 1119661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1120958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1121bda325fcSPaul Mullowney PetscFunctionReturn(0); 1122bda325fcSPaul Mullowney } 1123bda325fcSPaul Mullowney 11246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 11259ae82921SPaul Mullowney { 1126465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1127465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1128465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1129465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 11309ae82921SPaul Mullowney cusparseStatus_t stat; 11319ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1132aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1133aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1134aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1135b175d8bbSPaul Mullowney PetscErrorCode ierr; 11369ae82921SPaul Mullowney 11379ae82921SPaul Mullowney PetscFunctionBegin; 1138ebc8f436SDominic Meiser 1139e057df02SPaul Mullowney /* Get the GPU pointers */ 1140c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1141c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1142c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1143c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 11449ae82921SPaul Mullowney 11457a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1146aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1147c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1148c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1149*4e4bbfaaSStefano Zampini tempGPU->begin()); 1150aa372e3fSPaul Mullowney 1151aa372e3fSPaul Mullowney /* Next, solve L */ 1152aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 11537656d835SStefano Zampini loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1154aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1155aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1156aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1157aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1158*4e4bbfaaSStefano Zampini tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1159aa372e3fSPaul Mullowney 1160aa372e3fSPaul Mullowney /* Then, solve U */ 1161aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 11627656d835SStefano Zampini upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1163aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1164aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1165aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1166aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1167*4e4bbfaaSStefano Zampini xarray, tempGPU->data().get());CHKERRCUDA(stat); 1168aa372e3fSPaul Mullowney 1169*4e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1170*4e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1171*4e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1172*4e4bbfaaSStefano Zampini xGPU); 11739ae82921SPaul Mullowney 1174c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1175c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1176c41cb2e2SAlejandro Lamas Daviña ierr = WaitForGPU();CHKERRCUDA(ierr); 1177661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1178958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 11799ae82921SPaul Mullowney PetscFunctionReturn(0); 11809ae82921SPaul Mullowney } 11819ae82921SPaul Mullowney 11826fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 11839ae82921SPaul Mullowney { 1184465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1185465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 11869ae82921SPaul Mullowney cusparseStatus_t stat; 11879ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1188aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1189aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1190aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1191b175d8bbSPaul Mullowney PetscErrorCode ierr; 11929ae82921SPaul Mullowney 11939ae82921SPaul Mullowney PetscFunctionBegin; 1194e057df02SPaul Mullowney /* Get the GPU pointers */ 1195c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1196c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 11979ae82921SPaul Mullowney 11987a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1199aa372e3fSPaul Mullowney /* First, solve L */ 1200aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 12017656d835SStefano Zampini loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1202aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1203aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1204aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1205aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1206c41cb2e2SAlejandro Lamas Daviña barray, tempGPU->data().get());CHKERRCUDA(stat); 1207aa372e3fSPaul Mullowney 1208aa372e3fSPaul Mullowney /* Next, solve U */ 1209aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 12107656d835SStefano Zampini upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1211aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1212aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1213aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1214aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1215c41cb2e2SAlejandro Lamas Daviña tempGPU->data().get(), xarray);CHKERRCUDA(stat); 12169ae82921SPaul Mullowney 1217c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1218c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1219c41cb2e2SAlejandro Lamas Daviña ierr = WaitForGPU();CHKERRCUDA(ierr); 1220661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1221958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 12229ae82921SPaul Mullowney PetscFunctionReturn(0); 12239ae82921SPaul Mullowney } 12249ae82921SPaul Mullowney 12256fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 12269ae82921SPaul Mullowney { 1227aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1228aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 12299ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 12309ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 12319ae82921SPaul Mullowney PetscErrorCode ierr; 1232aa372e3fSPaul Mullowney cusparseStatus_t stat; 1233b06137fdSPaul Mullowney cudaError_t err; 12349ae82921SPaul Mullowney 12359ae82921SPaul Mullowney PetscFunctionBegin; 1236b8ced49eSKarl Rupp if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 12379ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 123834d6c7a5SJose E. Roman if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 123934d6c7a5SJose E. Roman CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 124034d6c7a5SJose E. Roman /* copy values only */ 124134d6c7a5SJose E. Roman matrix->values->assign(a->a, a->a+a->nz); 12424863603aSSatish Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 124334d6c7a5SJose E. Roman } else { 1244470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 12459ae82921SPaul Mullowney try { 1246aa372e3fSPaul Mullowney cusparsestruct->nonzerorow=0; 1247aa372e3fSPaul Mullowney for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 12489ae82921SPaul Mullowney 12499ae82921SPaul Mullowney if (a->compressedrow.use) { 12509ae82921SPaul Mullowney m = a->compressedrow.nrows; 12519ae82921SPaul Mullowney ii = a->compressedrow.i; 12529ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 12539ae82921SPaul Mullowney } else { 1254b06137fdSPaul Mullowney /* Forcing compressed row on the GPU */ 12559ae82921SPaul Mullowney int k=0; 1256854ce69bSBarry Smith ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1257854ce69bSBarry Smith ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 12589ae82921SPaul Mullowney ii[0]=0; 12599ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 12609ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 12619ae82921SPaul Mullowney ii[k] = a->i[j]; 12629ae82921SPaul Mullowney ridx[k]= j; 12639ae82921SPaul Mullowney k++; 12649ae82921SPaul Mullowney } 12659ae82921SPaul Mullowney } 1266aa372e3fSPaul Mullowney ii[cusparsestruct->nonzerorow] = a->nz; 1267aa372e3fSPaul Mullowney m = cusparsestruct->nonzerorow; 12689ae82921SPaul Mullowney } 12699ae82921SPaul Mullowney 1270aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 1271aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1272c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1273c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1274c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 12759ae82921SPaul Mullowney 1276c41cb2e2SAlejandro Lamas Daviña err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 12777656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12787656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 12797656d835SStefano Zampini err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12807656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12817656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1282c41cb2e2SAlejandro Lamas Daviña stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1283b06137fdSPaul Mullowney 1284aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1285aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1286aa372e3fSPaul Mullowney /* set the matrix */ 1287aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1288a65300a6SPaul Mullowney matrix->num_rows = m; 1289aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1290aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1291a65300a6SPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1292a65300a6SPaul Mullowney matrix->row_offsets->assign(ii, ii + m+1); 12939ae82921SPaul Mullowney 1294aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1295aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1296aa372e3fSPaul Mullowney 1297aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1298aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1299aa372e3fSPaul Mullowney 1300aa372e3fSPaul Mullowney /* assign the pointer */ 1301aa372e3fSPaul Mullowney matstruct->mat = matrix; 1302aa372e3fSPaul Mullowney 1303aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1304aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1305a65300a6SPaul Mullowney matrix->num_rows = m; 1306aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1307aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1308a65300a6SPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1309a65300a6SPaul Mullowney matrix->row_offsets->assign(ii, ii + m+1); 1310aa372e3fSPaul Mullowney 1311aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1312aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1313aa372e3fSPaul Mullowney 1314aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1315aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1316aa372e3fSPaul Mullowney 1317aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 1318c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1319aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1320aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1321a65300a6SPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1322aa372e3fSPaul Mullowney matstruct->descr, matrix->values->data().get(), 1323aa372e3fSPaul Mullowney matrix->row_offsets->data().get(), 1324aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1325c41cb2e2SAlejandro Lamas Daviña hybMat, 0, partition);CHKERRCUDA(stat); 1326aa372e3fSPaul Mullowney /* assign the pointer */ 1327aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1328aa372e3fSPaul Mullowney 1329aa372e3fSPaul Mullowney if (matrix) { 1330aa372e3fSPaul Mullowney if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1331aa372e3fSPaul Mullowney if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1332aa372e3fSPaul Mullowney if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1333aa372e3fSPaul Mullowney delete (CsrMatrix*)matrix; 1334087f3262SPaul Mullowney } 1335087f3262SPaul Mullowney } 1336ca45077fSPaul Mullowney 1337aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1338aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1339aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 13404863603aSSatish Balay ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1341aa372e3fSPaul Mullowney 1342aa372e3fSPaul Mullowney /* assign the pointer */ 1343aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 1344aa372e3fSPaul Mullowney 13459ae82921SPaul Mullowney if (!a->compressedrow.use) { 13469ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 13479ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 13489ae82921SPaul Mullowney } 1349e65717acSKarl Rupp cusparsestruct->workVector = new THRUSTARRAY(m); 13509ae82921SPaul Mullowney } catch(char *ex) { 13519ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 13529ae82921SPaul Mullowney } 135334d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 135434d6c7a5SJose E. Roman } 13557d0e52d8SJose E. Roman ierr = WaitForGPU();CHKERRCUDA(ierr); 1356b8ced49eSKarl Rupp A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 13579ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 13589ae82921SPaul Mullowney } 13599ae82921SPaul Mullowney PetscFunctionReturn(0); 13609ae82921SPaul Mullowney } 13619ae82921SPaul Mullowney 1362c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1363aa372e3fSPaul Mullowney { 1364aa372e3fSPaul Mullowney template <typename Tuple> 1365aa372e3fSPaul Mullowney __host__ __device__ 1366aa372e3fSPaul Mullowney void operator()(Tuple t) 1367aa372e3fSPaul Mullowney { 1368aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1369aa372e3fSPaul Mullowney } 1370aa372e3fSPaul Mullowney }; 1371aa372e3fSPaul Mullowney 13726fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 13739ae82921SPaul Mullowney { 1374b175d8bbSPaul Mullowney PetscErrorCode ierr; 13759ae82921SPaul Mullowney 13769ae82921SPaul Mullowney PetscFunctionBegin; 13777656d835SStefano Zampini ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 13789ae82921SPaul Mullowney PetscFunctionReturn(0); 13799ae82921SPaul Mullowney } 13809ae82921SPaul Mullowney 13816fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1382ca45077fSPaul Mullowney { 1383ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1384aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 13859ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1386465f34aeSAlejandro Lamas Daviña const PetscScalar *xarray; 1387465f34aeSAlejandro Lamas Daviña PetscScalar *yarray; 1388b175d8bbSPaul Mullowney PetscErrorCode ierr; 1389aa372e3fSPaul Mullowney cusparseStatus_t stat; 1390ca45077fSPaul Mullowney 1391ca45077fSPaul Mullowney PetscFunctionBegin; 139234d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 139334d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 13949ff858a8SKarl Rupp matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1395aa372e3fSPaul Mullowney if (!matstructT) { 1396bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1397aa372e3fSPaul Mullowney matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1398bda325fcSPaul Mullowney } 1399c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1400c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1401aa372e3fSPaul Mullowney 14027a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1403aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1404aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1405aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1406aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, 1407b06137fdSPaul Mullowney mat->num_entries, matstructT->alpha, matstructT->descr, 1408aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 14097656d835SStefano Zampini mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1410c41cb2e2SAlejandro Lamas Daviña yarray);CHKERRCUDA(stat); 1411aa372e3fSPaul Mullowney } else { 1412aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1413aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1414b06137fdSPaul Mullowney matstructT->alpha, matstructT->descr, hybMat, 14157656d835SStefano Zampini xarray, matstructT->beta_zero, 1416c41cb2e2SAlejandro Lamas Daviña yarray);CHKERRCUDA(stat); 1417ca45077fSPaul Mullowney } 1418c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1419c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1420c41cb2e2SAlejandro Lamas Daviña ierr = WaitForGPU();CHKERRCUDA(ierr); 1421661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1422958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1423ca45077fSPaul Mullowney PetscFunctionReturn(0); 1424ca45077fSPaul Mullowney } 1425ca45077fSPaul Mullowney 1426aa372e3fSPaul Mullowney 14276fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 14289ae82921SPaul Mullowney { 14299ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1430aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 14319ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1432465f34aeSAlejandro Lamas Daviña const PetscScalar *xarray; 14337656d835SStefano Zampini PetscScalar *zarray,*dptr,*beta; 1434b175d8bbSPaul Mullowney PetscErrorCode ierr; 1435aa372e3fSPaul Mullowney cusparseStatus_t stat; 1436c1fb3f03SStefano Zampini PetscBool cmpr; /* if the matrix has been compressed (zero rows) */ 14376e111a19SKarl Rupp 14389ae82921SPaul Mullowney PetscFunctionBegin; 143934d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 144034d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 14419ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 14429ae82921SPaul Mullowney try { 1443c1fb3f03SStefano Zampini cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n)); 1444c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1445c1fb3f03SStefano Zampini if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */ 1446f2d70e9dSBarry Smith ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1447c1fb3f03SStefano Zampini } else { 1448c1fb3f03SStefano Zampini ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1449c1fb3f03SStefano Zampini } 1450c1fb3f03SStefano Zampini dptr = cmpr ? zarray : cusparsestruct->workVector->data().get(); 14517656d835SStefano Zampini beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero; 14529ae82921SPaul Mullowney 14537a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 14547656d835SStefano Zampini /* csr_spmv is multiply add */ 1455aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1456b06137fdSPaul Mullowney /* here we need to be careful to set the number of rows in the multiply to the 1457b06137fdSPaul Mullowney number of compressed rows in the matrix ... which is equivalent to the 1458b06137fdSPaul Mullowney size of the workVector */ 14597656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1460aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1461a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 1462b06137fdSPaul Mullowney mat->num_entries, matstruct->alpha, matstruct->descr, 1463aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 14647656d835SStefano Zampini mat->column_indices->data().get(), xarray, beta, 14657656d835SStefano Zampini dptr);CHKERRCUDA(stat); 1466aa372e3fSPaul Mullowney } else { 1467a65300a6SPaul Mullowney if (cusparsestruct->workVector->size()) { 1468301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1469aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1470b06137fdSPaul Mullowney matstruct->alpha, matstruct->descr, hybMat, 14717656d835SStefano Zampini xarray, beta, 14727656d835SStefano Zampini dptr);CHKERRCUDA(stat); 1473a65300a6SPaul Mullowney } 1474aa372e3fSPaul Mullowney } 1475958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1476aa372e3fSPaul Mullowney 14777656d835SStefano Zampini if (yy) { 14787656d835SStefano Zampini if (dptr != zarray) { 14797656d835SStefano Zampini ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 14807656d835SStefano Zampini } else if (zz != yy) { 14817656d835SStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 14827656d835SStefano Zampini } 14837656d835SStefano Zampini } else if (dptr != zarray) { 1484c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 14857656d835SStefano Zampini } 1486aa372e3fSPaul Mullowney /* scatter the data from the temporary into the full vector with a += operation */ 14877a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 14887656d835SStefano Zampini if (dptr != zarray) { 14897656d835SStefano Zampini thrust::device_ptr<PetscScalar> zptr; 14907656d835SStefano Zampini 14917656d835SStefano Zampini zptr = thrust::device_pointer_cast(zarray); 1492c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1493c41cb2e2SAlejandro Lamas Daviña thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1494c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 14957656d835SStefano Zampini } 1496958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1497c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1498c1fb3f03SStefano Zampini if (yy && !cmpr) { 1499f2d70e9dSBarry Smith ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1500c1fb3f03SStefano Zampini } else { 1501c1fb3f03SStefano Zampini ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1502c1fb3f03SStefano Zampini } 15039ae82921SPaul Mullowney } catch(char *ex) { 15049ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 15059ae82921SPaul Mullowney } 15067656d835SStefano Zampini if (!yy) { /* MatMult */ 15077656d835SStefano Zampini if (!cusparsestruct->stream) { 1508c41cb2e2SAlejandro Lamas Daviña ierr = WaitForGPU();CHKERRCUDA(ierr); 15097656d835SStefano Zampini } 15107656d835SStefano Zampini } 1511958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 15129ae82921SPaul Mullowney PetscFunctionReturn(0); 15139ae82921SPaul Mullowney } 15149ae82921SPaul Mullowney 15156fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1516ca45077fSPaul Mullowney { 1517ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1518aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 15199ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1520465f34aeSAlejandro Lamas Daviña const PetscScalar *xarray; 1521a3fdcf43SKarl Rupp PetscScalar *zarray,*dptr,*beta; 1522b175d8bbSPaul Mullowney PetscErrorCode ierr; 1523aa372e3fSPaul Mullowney cusparseStatus_t stat; 15246e111a19SKarl Rupp 1525ca45077fSPaul Mullowney PetscFunctionBegin; 152634d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 152734d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 15289ff858a8SKarl Rupp matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1529aa372e3fSPaul Mullowney if (!matstructT) { 1530bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1531aa372e3fSPaul Mullowney matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1532bda325fcSPaul Mullowney } 1533aa372e3fSPaul Mullowney 1534ca45077fSPaul Mullowney try { 1535c41cb2e2SAlejandro Lamas Daviña ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1536c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1537f2d70e9dSBarry Smith ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1538a3fdcf43SKarl Rupp dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get(); 1539a3fdcf43SKarl Rupp beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero; 1540ca45077fSPaul Mullowney 15417a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1542e057df02SPaul Mullowney /* multiply add with matrix transpose */ 1543aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1544aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1545b06137fdSPaul Mullowney /* here we need to be careful to set the number of rows in the multiply to the 1546b06137fdSPaul Mullowney number of compressed rows in the matrix ... which is equivalent to the 1547b06137fdSPaul Mullowney size of the workVector */ 1548aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1549a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 1550b06137fdSPaul Mullowney mat->num_entries, matstructT->alpha, matstructT->descr, 1551aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1552a3fdcf43SKarl Rupp mat->column_indices->data().get(), xarray, beta, 1553a3fdcf43SKarl Rupp dptr);CHKERRCUDA(stat); 1554aa372e3fSPaul Mullowney } else { 1555aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1556a65300a6SPaul Mullowney if (cusparsestruct->workVector->size()) { 1557aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1558b06137fdSPaul Mullowney matstructT->alpha, matstructT->descr, hybMat, 1559a3fdcf43SKarl Rupp xarray, beta, 1560a3fdcf43SKarl Rupp dptr);CHKERRCUDA(stat); 1561a65300a6SPaul Mullowney } 1562aa372e3fSPaul Mullowney } 1563a3fdcf43SKarl Rupp ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1564a3fdcf43SKarl Rupp 1565a3fdcf43SKarl Rupp if (dptr != zarray) { 1566a3fdcf43SKarl Rupp ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1567a3fdcf43SKarl Rupp } else if (zz != yy) { 1568a3fdcf43SKarl Rupp ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1569a3fdcf43SKarl Rupp } 1570a3fdcf43SKarl Rupp /* scatter the data from the temporary into the full vector with a += operation */ 1571a3fdcf43SKarl Rupp ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1572a3fdcf43SKarl Rupp if (dptr != zarray) { 1573a3fdcf43SKarl Rupp thrust::device_ptr<PetscScalar> zptr; 1574a3fdcf43SKarl Rupp 1575a3fdcf43SKarl Rupp zptr = thrust::device_pointer_cast(zarray); 1576aa372e3fSPaul Mullowney 1577aa372e3fSPaul Mullowney /* scatter the data from the temporary into the full vector with a += operation */ 1578c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1579554b8892SKarl Rupp thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1580c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 1581a3fdcf43SKarl Rupp } 1582a3fdcf43SKarl Rupp ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1583c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1584f2d70e9dSBarry Smith ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1585ca45077fSPaul Mullowney } catch(char *ex) { 1586ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1587661c2d29Shannah_mairs } 1588a3fdcf43SKarl Rupp ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */ 1589958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1590ca45077fSPaul Mullowney PetscFunctionReturn(0); 1591ca45077fSPaul Mullowney } 1592ca45077fSPaul Mullowney 15936fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 15949ae82921SPaul Mullowney { 15959ae82921SPaul Mullowney PetscErrorCode ierr; 15966e111a19SKarl Rupp 15979ae82921SPaul Mullowney PetscFunctionBegin; 15989ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1599489de41dSStefano Zampini if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1600bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 1601e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1602bc3f50f2SPaul Mullowney } 1603bbf3fe20SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 1604bbf3fe20SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1605bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1606bbf3fe20SPaul Mullowney A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 16079ae82921SPaul Mullowney PetscFunctionReturn(0); 16089ae82921SPaul Mullowney } 16099ae82921SPaul Mullowney 16109ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 1611e057df02SPaul Mullowney /*@ 16129ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1613e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 1614e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1615e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 1616e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 1617e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 16189ae82921SPaul Mullowney 1619d083f849SBarry Smith Collective 16209ae82921SPaul Mullowney 16219ae82921SPaul Mullowney Input Parameters: 16229ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 16239ae82921SPaul Mullowney . m - number of rows 16249ae82921SPaul Mullowney . n - number of columns 16259ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 16269ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 16270298fd71SBarry Smith (possibly different for each row) or NULL 16289ae82921SPaul Mullowney 16299ae82921SPaul Mullowney Output Parameter: 16309ae82921SPaul Mullowney . A - the matrix 16319ae82921SPaul Mullowney 16329ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 16339ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 16349ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 16359ae82921SPaul Mullowney 16369ae82921SPaul Mullowney Notes: 16379ae82921SPaul Mullowney If nnz is given then nz is ignored 16389ae82921SPaul Mullowney 16399ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 16409ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 16419ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 16429ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 16439ae82921SPaul Mullowney 16449ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 16450298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 16469ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 16479ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 16489ae82921SPaul Mullowney 16499ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 16509ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 16519ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 16529ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 16539ae82921SPaul Mullowney 16549ae82921SPaul Mullowney Level: intermediate 16559ae82921SPaul Mullowney 1656e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 16579ae82921SPaul Mullowney @*/ 16589ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 16599ae82921SPaul Mullowney { 16609ae82921SPaul Mullowney PetscErrorCode ierr; 16619ae82921SPaul Mullowney 16629ae82921SPaul Mullowney PetscFunctionBegin; 16639ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 16649ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 16659ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 16669ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 16679ae82921SPaul Mullowney PetscFunctionReturn(0); 16689ae82921SPaul Mullowney } 16699ae82921SPaul Mullowney 16706fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 16719ae82921SPaul Mullowney { 16729ae82921SPaul Mullowney PetscErrorCode ierr; 1673ab25e6cbSDominic Meiser 16749ae82921SPaul Mullowney PetscFunctionBegin; 16759ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 1676b8ced49eSKarl Rupp if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1677470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 16789ae82921SPaul Mullowney } 16799ae82921SPaul Mullowney } else { 1680470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1681aa372e3fSPaul Mullowney } 16829ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 16839ae82921SPaul Mullowney PetscFunctionReturn(0); 16849ae82921SPaul Mullowney } 16859ae82921SPaul Mullowney 16869ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 16879ff858a8SKarl Rupp { 16889ff858a8SKarl Rupp PetscErrorCode ierr; 16899ff858a8SKarl Rupp Mat C; 16909ff858a8SKarl Rupp cusparseStatus_t stat; 16919ff858a8SKarl Rupp cusparseHandle_t handle=0; 16929ff858a8SKarl Rupp 16939ff858a8SKarl Rupp PetscFunctionBegin; 16949ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 16959ff858a8SKarl Rupp C = *B; 169634136279SStefano Zampini ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 169734136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 169834136279SStefano Zampini 16999ff858a8SKarl Rupp /* inject CUSPARSE-specific stuff */ 17009ff858a8SKarl Rupp if (C->factortype==MAT_FACTOR_NONE) { 17019ff858a8SKarl Rupp /* you cannot check the inode.use flag here since the matrix was just created. 17029ff858a8SKarl Rupp now build a GPU matrix data structure */ 17039ff858a8SKarl Rupp C->spptr = new Mat_SeqAIJCUSPARSE; 17049ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 17059ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 17069ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 17079ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 17089ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 17099ff858a8SKarl Rupp stat = cusparseCreate(&handle);CHKERRCUDA(stat); 17109ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 17119ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 17129ff858a8SKarl Rupp } else { 17139ff858a8SKarl Rupp /* NEXT, set the pointers to the triangular factors */ 17149ff858a8SKarl Rupp C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 17159ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 17169ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 17179ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 17189ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 17199ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 17209ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 17219ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 17229ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 17239ff858a8SKarl Rupp stat = cusparseCreate(&handle);CHKERRCUDA(stat); 17249ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 17259ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 17269ff858a8SKarl Rupp } 17279ff858a8SKarl Rupp 17289ff858a8SKarl Rupp C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 17299ff858a8SKarl Rupp C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 17309ff858a8SKarl Rupp C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 17319ff858a8SKarl Rupp C->ops->mult = MatMult_SeqAIJCUSPARSE; 17329ff858a8SKarl Rupp C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 17339ff858a8SKarl Rupp C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 17349ff858a8SKarl Rupp C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 17359ff858a8SKarl Rupp C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 17369ff858a8SKarl Rupp 17379ff858a8SKarl Rupp ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 17389ff858a8SKarl Rupp 1739b8ced49eSKarl Rupp C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 17409ff858a8SKarl Rupp 17419ff858a8SKarl Rupp ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 17429ff858a8SKarl Rupp PetscFunctionReturn(0); 17439ff858a8SKarl Rupp } 17449ff858a8SKarl Rupp 174502fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 17469ae82921SPaul Mullowney { 17479ae82921SPaul Mullowney PetscErrorCode ierr; 1748aa372e3fSPaul Mullowney cusparseStatus_t stat; 1749aa372e3fSPaul Mullowney cusparseHandle_t handle=0; 17509ae82921SPaul Mullowney 17519ae82921SPaul Mullowney PetscFunctionBegin; 175234136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 175334136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 175434136279SStefano Zampini 17559ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 1756e057df02SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created. 1757e057df02SPaul Mullowney now build a GPU matrix data structure */ 17589ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 17599ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1760aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1761aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1762e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1763aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1764c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1765aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 17669ff858a8SKarl Rupp ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 17679ae82921SPaul Mullowney } else { 17689ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 1769debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 17709ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 17719ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1772aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1773aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1774aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1775aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1776aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1777c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1778aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1779aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 17809ae82921SPaul Mullowney } 1781aa372e3fSPaul Mullowney 17829ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 17839ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 17849ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1785ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 1786ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1787ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1788ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 17899ff858a8SKarl Rupp B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 17902205254eSKarl Rupp 17919ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 17922205254eSKarl Rupp 1793b8ced49eSKarl Rupp B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 17942205254eSKarl Rupp 1795bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 17969ae82921SPaul Mullowney PetscFunctionReturn(0); 17979ae82921SPaul Mullowney } 17989ae82921SPaul Mullowney 179902fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 180002fe1965SBarry Smith { 180102fe1965SBarry Smith PetscErrorCode ierr; 180202fe1965SBarry Smith 180302fe1965SBarry Smith PetscFunctionBegin; 180402fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1805489de41dSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 180602fe1965SBarry Smith PetscFunctionReturn(0); 180702fe1965SBarry Smith } 180802fe1965SBarry Smith 18093ca39a21SBarry Smith /*MC 1810e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1811e057df02SPaul Mullowney 1812e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 18132692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 18142692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1815e057df02SPaul Mullowney 1816e057df02SPaul Mullowney Options Database Keys: 1817e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1818aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1819a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1820e057df02SPaul Mullowney 1821e057df02SPaul Mullowney Level: beginner 1822e057df02SPaul Mullowney 18238468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1824e057df02SPaul Mullowney M*/ 18257f756511SDominic Meiser 182642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 182742c9c57cSBarry Smith 18280f39cd5aSBarry Smith 18293ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 183042c9c57cSBarry Smith { 183142c9c57cSBarry Smith PetscErrorCode ierr; 183242c9c57cSBarry Smith 183342c9c57cSBarry Smith PetscFunctionBegin; 18343ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 18353ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 18363ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 18373ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 183842c9c57cSBarry Smith PetscFunctionReturn(0); 183942c9c57cSBarry Smith } 184029b38603SBarry Smith 184181e08676SBarry Smith 1842470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 18437f756511SDominic Meiser { 18447f756511SDominic Meiser cusparseStatus_t stat; 18457f756511SDominic Meiser cusparseHandle_t handle; 18467f756511SDominic Meiser 18477f756511SDominic Meiser PetscFunctionBegin; 18487f756511SDominic Meiser if (*cusparsestruct) { 1849470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1850470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 18517f756511SDominic Meiser delete (*cusparsestruct)->workVector; 18527f756511SDominic Meiser if (handle = (*cusparsestruct)->handle) { 1853c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(handle);CHKERRCUDA(stat); 18547f756511SDominic Meiser } 18557f756511SDominic Meiser delete *cusparsestruct; 18567f756511SDominic Meiser *cusparsestruct = 0; 18577f756511SDominic Meiser } 18587f756511SDominic Meiser PetscFunctionReturn(0); 18597f756511SDominic Meiser } 18607f756511SDominic Meiser 18617f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 18627f756511SDominic Meiser { 18637f756511SDominic Meiser PetscFunctionBegin; 18647f756511SDominic Meiser if (*mat) { 18657f756511SDominic Meiser delete (*mat)->values; 18667f756511SDominic Meiser delete (*mat)->column_indices; 18677f756511SDominic Meiser delete (*mat)->row_offsets; 18687f756511SDominic Meiser delete *mat; 18697f756511SDominic Meiser *mat = 0; 18707f756511SDominic Meiser } 18717f756511SDominic Meiser PetscFunctionReturn(0); 18727f756511SDominic Meiser } 18737f756511SDominic Meiser 1874470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 18757f756511SDominic Meiser { 18767f756511SDominic Meiser cusparseStatus_t stat; 18777f756511SDominic Meiser PetscErrorCode ierr; 18787f756511SDominic Meiser 18797f756511SDominic Meiser PetscFunctionBegin; 18807f756511SDominic Meiser if (*trifactor) { 1881c41cb2e2SAlejandro Lamas Daviña if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1882c41cb2e2SAlejandro Lamas Daviña if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 18837f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 18847f756511SDominic Meiser delete *trifactor; 18857f756511SDominic Meiser *trifactor = 0; 18867f756511SDominic Meiser } 18877f756511SDominic Meiser PetscFunctionReturn(0); 18887f756511SDominic Meiser } 18897f756511SDominic Meiser 1890470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 18917f756511SDominic Meiser { 18927f756511SDominic Meiser CsrMatrix *mat; 18937f756511SDominic Meiser cusparseStatus_t stat; 18947f756511SDominic Meiser cudaError_t err; 18957f756511SDominic Meiser 18967f756511SDominic Meiser PetscFunctionBegin; 18977f756511SDominic Meiser if (*matstruct) { 18987f756511SDominic Meiser if ((*matstruct)->mat) { 18997f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 19007f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1901c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 19027f756511SDominic Meiser } else { 19037f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 19047f756511SDominic Meiser CsrMatrix_Destroy(&mat); 19057f756511SDominic Meiser } 19067f756511SDominic Meiser } 1907c41cb2e2SAlejandro Lamas Daviña if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 19087f756511SDominic Meiser delete (*matstruct)->cprowIndices; 1909c41cb2e2SAlejandro Lamas Daviña if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 19107656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 19117656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 19127f756511SDominic Meiser delete *matstruct; 19137f756511SDominic Meiser *matstruct = 0; 19147f756511SDominic Meiser } 19157f756511SDominic Meiser PetscFunctionReturn(0); 19167f756511SDominic Meiser } 19177f756511SDominic Meiser 1918470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 19197f756511SDominic Meiser { 19207f756511SDominic Meiser cusparseHandle_t handle; 19217f756511SDominic Meiser cusparseStatus_t stat; 19227f756511SDominic Meiser 19237f756511SDominic Meiser PetscFunctionBegin; 19247f756511SDominic Meiser if (*trifactors) { 1925470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1926470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1927470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1928470880abSPatrick Sanan MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 19297f756511SDominic Meiser delete (*trifactors)->rpermIndices; 19307f756511SDominic Meiser delete (*trifactors)->cpermIndices; 19317f756511SDominic Meiser delete (*trifactors)->workVector; 19327f756511SDominic Meiser if (handle = (*trifactors)->handle) { 1933c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(handle);CHKERRCUDA(stat); 19347f756511SDominic Meiser } 19357f756511SDominic Meiser delete *trifactors; 19367f756511SDominic Meiser *trifactors = 0; 19377f756511SDominic Meiser } 19387f756511SDominic Meiser PetscFunctionReturn(0); 19397f756511SDominic Meiser } 19407f756511SDominic Meiser 1941