/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>

#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

/* Enum-name table for MatCUSPARSEStorageFormat, in the layout PetscOptionsEnum()
   expects: value names, then the enum type name, then the common prefix, then 0. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream = 0;

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver package name ("cusparse") for factors produced by this class. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  ./configure --download-txpetscgpu to install PETSc to use CUSPARSE

  Consult CUSPARSE documentation for more information about the matrix storage formats
  which correspond to the options database keys below.

  Options Database Keys:
. -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Creates a factor matrix of type SEQAIJCUSPARSE: delegates the symbolic machinery
   to the PETSc seqaij factor, then installs the CUSPARSE-backed symbolic hooks
   appropriate for the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format on the SpMV matrix and/or the (global) triangular-solve default. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    /* affects subsequently-built triangular factors, not this matrix's SpMV storage */
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
  to build/install PETSc to use this package.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

  Output Parameter:

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod so that non-CUSPARSE matrix types silently ignore the request */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes -mat_cusparse_* options. For un-factored matrices the SpMV storage
   format is configurable; for factor matrices the triangular-solve format is.
   -mat_cusparse_storage_format sets both. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  /* FIX: PetscOptionsHead() must be called inside the PetscObjectOptionsBegin()/
     PetscOptionsEnd() scope; it was previously invoked before Begin. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegate to the CPU seqaij implementation, then redirect the
   numeric phase to the CUSPARSE version (which will also build GPU factors). */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic full LU: same pattern as ILU above — CPU symbolic, CUSPARSE numeric. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: CPU symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Extracts the unit-diagonal lower-triangular factor L from the combined
   seqaij LU storage (A->data holds both factors) into a host CSR copy with
   explicit 1.0 diagonal entries, uploads it to the GPU, and runs the CUSPARSE
   triangular-solve analysis. Only rebuilds when the GPU copy is stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n diagonal ones plus the strictly-lower entries of rows 1..n-1
         (row 0 of L is just the diagonal 1; assumes n >= 1). */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* page-locked host memory for fast host->device transfer */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate GPU storage in the configured format (CSR/ELL/HYB) */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Extracts the upper-triangular factor U from the combined seqaij LU storage
   using the a->diag pointers (which for the factored matrix index the diagonal
   of each row, stored in reverse order), scales the diagonal to 1/d, uploads
   to the GPU, and runs the triangular-solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards for the U factor, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix; rows are walked from last to first
         because U is stored in reverse row order in the factored seqaij. */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* seqaij stores the PIVOT (1/diag) at v[nz]; here the diagonal of U is
           stored as its inverse so the GPU solve can multiply instead of divide */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Builds both ILU triangular factors on the GPU, allocates the intermediate
   work vector for the two-stage solve, and records the row/column permutations
   (skipped when they are identities, so the cheap natural-ordering solve applies). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding L^{-1} b between the two triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds both ICC/Cholesky triangular factors on the GPU from the single
   upper-triangular factor stored by the CPU (in sbaij layout).  Both GPU
   matrices share the U structure (AiUp/AjUp); the "lower" solve is performed
   as a TRANSPOSE solve on upper-triangular storage with rescaled values (AALo). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscScalar                  *AALo;
  PetscInt                     nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* the Cholesky/ICC factor is stored in sbaij layout under the same A->data;
     NOTE(review): aliasing one A->data as both Mat_SeqAIJ and Mat_SeqSBAIJ is
     intentional here — the aij view supplies nz, the sbaij view the structure. */
  Mat_SeqSBAIJ                 *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt               *ai = b->i,*aj = b->j,*vj;
  const MatScalar              *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] holds the stored inverse diagonal (sbaij factor convention) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate off-diagonals for U; additionally scale by the diagonal for
             the transposed-lower copy */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* Build the upper triangular piece */
      cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;

      /* Build the lower triangular piece */
      /* stored as UPPER fill on purpose: the lower solve is done with a
         TRANSPOSE operation on this upper-triangular structure */
      cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
      stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;

      /* set this flag ... for performance logging */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Builds the ICC GPU factors, allocates the intermediate work vector, and
   records the (single, symmetric) permutation on both factors when it is
   not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);
  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU: performs the factorization on the CPU, selects the
   natural-ordering fast path for the GPU solve when both permutations are
   identities, then copies the triangular factors to the GPU. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b     = (Mat_SeqAIJ*)B->data;
  IS             isrow  = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC: CPU factorization followed by GPU factor setup
   (continues beyond this chunk). */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip  = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 562087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 563087f3262SPaul Mullowney if (perm_identity) { 564087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 565087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 566087f3262SPaul Mullowney } else { 567087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 568087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 569087f3262SPaul Mullowney } 570087f3262SPaul Mullowney 571087f3262SPaul Mullowney /* get the triangular factors */ 572087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 573087f3262SPaul Mullowney PetscFunctionReturn(0); 574087f3262SPaul Mullowney } 5759ae82921SPaul Mullowney 576bda325fcSPaul Mullowney #undef __FUNCT__ 577bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 578*b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 579bda325fcSPaul Mullowney { 580bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 581bda325fcSPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 582bda325fcSPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 583bda325fcSPaul Mullowney cusparseStatus_t stat; 584*b175d8bbSPaul Mullowney 585bda325fcSPaul Mullowney PetscFunctionBegin; 586087f3262SPaul Mullowney stat = cusparseMatLo->initializeCusparseMatTranspose(MAT_cusparseHandle, 587bda325fcSPaul Mullowney CUSPARSE_MATRIX_TYPE_TRIANGULAR, 588bda325fcSPaul Mullowney CUSPARSE_FILL_MODE_UPPER, 589087f3262SPaul Mullowney CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 590087f3262SPaul Mullowney stat = cusparseMatLo->solveAnalysisTranspose();CHKERRCUSP(stat); 591bda325fcSPaul Mullowney 592087f3262SPaul Mullowney stat = 
cusparseMatUp->initializeCusparseMatTranspose(MAT_cusparseHandle, 593bda325fcSPaul Mullowney CUSPARSE_MATRIX_TYPE_TRIANGULAR, 594bda325fcSPaul Mullowney CUSPARSE_FILL_MODE_LOWER, 595087f3262SPaul Mullowney CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 596087f3262SPaul Mullowney stat = cusparseMatUp->solveAnalysisTranspose();CHKERRCUSP(stat); 597bda325fcSPaul Mullowney PetscFunctionReturn(0); 598bda325fcSPaul Mullowney } 599bda325fcSPaul Mullowney 600bda325fcSPaul Mullowney #undef __FUNCT__ 601bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 602*b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 603bda325fcSPaul Mullowney { 604bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 605bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 606bda325fcSPaul Mullowney cusparseStatus_t stat; 607*b175d8bbSPaul Mullowney PetscErrorCode ierr; 608*b175d8bbSPaul Mullowney 609bda325fcSPaul Mullowney PetscFunctionBegin; 610087f3262SPaul Mullowney if (cusparseMat->isSymmOrHerm) { 611087f3262SPaul Mullowney stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 612087f3262SPaul Mullowney } else { 613087f3262SPaul Mullowney stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 614087f3262SPaul Mullowney } 615087f3262SPaul Mullowney ierr = cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr); 616bda325fcSPaul Mullowney PetscFunctionReturn(0); 617bda325fcSPaul Mullowney } 618bda325fcSPaul Mullowney 619bda325fcSPaul Mullowney #undef __FUNCT__ 620bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 621bda325fcSPaul Mullowney PetscErrorCode 
MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 622bda325fcSPaul Mullowney { 623bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 624bda325fcSPaul Mullowney CUSPARRAY *xGPU, *bGPU; 625bda325fcSPaul Mullowney cusparseStatus_t stat; 626bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 627bda325fcSPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 628bda325fcSPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 629bda325fcSPaul Mullowney CUSPARRAY *tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 630*b175d8bbSPaul Mullowney PetscErrorCode ierr; 631bda325fcSPaul Mullowney 632bda325fcSPaul Mullowney PetscFunctionBegin; 633bda325fcSPaul Mullowney /* Analyze the matrix ... on the fly */ 634bda325fcSPaul Mullowney if (!cusparseTriFactors->hasTranspose) { 635bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 636bda325fcSPaul Mullowney cusparseTriFactors->hasTranspose=PETSC_TRUE; 637bda325fcSPaul Mullowney } 638bda325fcSPaul Mullowney 639bda325fcSPaul Mullowney /* Get the GPU pointers */ 640bda325fcSPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 641bda325fcSPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 642bda325fcSPaul Mullowney 643bda325fcSPaul Mullowney /* solve with reordering */ 644bda325fcSPaul Mullowney ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 645087f3262SPaul Mullowney stat = cusparseMatUp->solveTranspose(xGPU, tempGPU);CHKERRCUSP(stat); 646087f3262SPaul Mullowney stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat); 647bda325fcSPaul Mullowney ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr); 648bda325fcSPaul Mullowney 649bda325fcSPaul Mullowney /* restore */ 650bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 651bda325fcSPaul 
Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 652bda325fcSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 653087f3262SPaul Mullowney 654087f3262SPaul Mullowney if (cusparseTriFactors->isSymmOrHerm) { 655087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 656087f3262SPaul Mullowney } else { 657bda325fcSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 658087f3262SPaul Mullowney } 659bda325fcSPaul Mullowney PetscFunctionReturn(0); 660bda325fcSPaul Mullowney } 661bda325fcSPaul Mullowney 662bda325fcSPaul Mullowney #undef __FUNCT__ 663bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 664bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 665bda325fcSPaul Mullowney { 666bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 667bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 668bda325fcSPaul Mullowney cusparseStatus_t stat; 669bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 670bda325fcSPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 671bda325fcSPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 672bda325fcSPaul Mullowney CUSPARRAY *tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 673*b175d8bbSPaul Mullowney PetscErrorCode ierr; 674bda325fcSPaul Mullowney 675bda325fcSPaul Mullowney PetscFunctionBegin; 676bda325fcSPaul Mullowney /* Analyze the matrix ... 
on the fly */ 677bda325fcSPaul Mullowney if (!cusparseTriFactors->hasTranspose) { 678bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 679bda325fcSPaul Mullowney cusparseTriFactors->hasTranspose=PETSC_TRUE; 680bda325fcSPaul Mullowney } 681bda325fcSPaul Mullowney 682bda325fcSPaul Mullowney /* Get the GPU pointers */ 683bda325fcSPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 684bda325fcSPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 685bda325fcSPaul Mullowney 686bda325fcSPaul Mullowney /* solve */ 687087f3262SPaul Mullowney stat = cusparseMatUp->solveTranspose(bGPU, tempGPU);CHKERRCUSP(stat); 688087f3262SPaul Mullowney stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat); 689bda325fcSPaul Mullowney 690bda325fcSPaul Mullowney /* restore */ 691bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 692bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 693bda325fcSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 694087f3262SPaul Mullowney if (cusparseTriFactors->isSymmOrHerm) { 695087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 696087f3262SPaul Mullowney } else { 697bda325fcSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 698087f3262SPaul Mullowney } 699bda325fcSPaul Mullowney PetscFunctionReturn(0); 700bda325fcSPaul Mullowney } 701bda325fcSPaul Mullowney 7029ae82921SPaul Mullowney #undef __FUNCT__ 7039ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 7049ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 7059ae82921SPaul Mullowney { 7069ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 707bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 7089ae82921SPaul Mullowney cusparseStatus_t stat; 7099ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 7109ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 7119ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 7129ae82921SPaul Mullowney CUSPARRAY *tempGPU = (CUSPARRAY*)cusparseTriFactors->tempvec; 713*b175d8bbSPaul Mullowney PetscErrorCode ierr; 7149ae82921SPaul Mullowney 7159ae82921SPaul Mullowney PetscFunctionBegin; 716e057df02SPaul Mullowney /* Get the GPU pointers */ 7179ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 7189ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 7199ae82921SPaul Mullowney 720e057df02SPaul Mullowney /* solve with reordering */ 7219ae82921SPaul Mullowney ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 7229ae82921SPaul Mullowney stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 7239ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 7249ae82921SPaul Mullowney ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 7259ae82921SPaul Mullowney 7269ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 7279ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 7289ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 729087f3262SPaul Mullowney if (cusparseTriFactors->isSymmOrHerm) { 730087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 731087f3262SPaul Mullowney } else { 7329ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 733087f3262SPaul Mullowney } 7349ae82921SPaul Mullowney PetscFunctionReturn(0); 7359ae82921SPaul Mullowney } 7369ae82921SPaul Mullowney 7379ae82921SPaul Mullowney #undef __FUNCT__ 7389ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 7399ae82921SPaul Mullowney PetscErrorCode 
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 7409ae82921SPaul Mullowney { 7419ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 742bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 7439ae82921SPaul Mullowney cusparseStatus_t stat; 7449ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 7459ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 7469ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 7479ae82921SPaul Mullowney CUSPARRAY *tempGPU = (CUSPARRAY*)cusparseTriFactors->tempvec; 748*b175d8bbSPaul Mullowney PetscErrorCode ierr; 7499ae82921SPaul Mullowney 7509ae82921SPaul Mullowney PetscFunctionBegin; 751e057df02SPaul Mullowney /* Get the GPU pointers */ 7529ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 7539ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 7549ae82921SPaul Mullowney 755e057df02SPaul Mullowney /* solve */ 7569ae82921SPaul Mullowney stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 7579ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 7589ae82921SPaul Mullowney 7599ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 7609ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 7619ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 762087f3262SPaul Mullowney if (cusparseTriFactors->isSymmOrHerm) { 763087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr); 764087f3262SPaul Mullowney } else { 7659ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 766087f3262SPaul Mullowney } 7679ae82921SPaul Mullowney PetscFunctionReturn(0); 7689ae82921SPaul Mullowney } 7699ae82921SPaul Mullowney 7709ae82921SPaul Mullowney #undef __FUNCT__ 
771e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 772e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 7739ae82921SPaul Mullowney { 7749ae82921SPaul Mullowney 7759ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 7769ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 7779ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 778087f3262SPaul Mullowney PetscBool symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE; 779087f3262SPaul Mullowney PetscBool symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE; 780*b175d8bbSPaul Mullowney PetscErrorCode ierr; 7819ae82921SPaul Mullowney 7829ae82921SPaul Mullowney PetscFunctionBegin; 7839ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 7849ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 7859ae82921SPaul Mullowney /* 7869ae82921SPaul Mullowney It may be possible to reuse nonzero structure with new matrix values but 7879ae82921SPaul Mullowney for simplicity and insured correctness we delete and build a new matrix on 7889ae82921SPaul Mullowney the GPU. Likely a very small performance hit. 
7899ae82921SPaul Mullowney */ 7909ae82921SPaul Mullowney if (cusparseMat->mat) { 7919ae82921SPaul Mullowney try { 7929ae82921SPaul Mullowney delete cusparseMat->mat; 7932205254eSKarl Rupp if (cusparseMat->tempvec) delete cusparseMat->tempvec; 7949ae82921SPaul Mullowney 7959ae82921SPaul Mullowney } catch(char *ex) { 7969ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 7979ae82921SPaul Mullowney } 7989ae82921SPaul Mullowney } 7999ae82921SPaul Mullowney try { 8009ae82921SPaul Mullowney cusparseMat->nonzerorow=0; 8012205254eSKarl Rupp for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 8029ae82921SPaul Mullowney 8039ae82921SPaul Mullowney if (a->compressedrow.use) { 8049ae82921SPaul Mullowney m = a->compressedrow.nrows; 8059ae82921SPaul Mullowney ii = a->compressedrow.i; 8069ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 8079ae82921SPaul Mullowney } else { 808e057df02SPaul Mullowney /* Forcing compressed row on the GPU ... only relevant for CSR storage */ 8099ae82921SPaul Mullowney int k=0; 8109ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 8119ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 8129ae82921SPaul Mullowney ii[0]=0; 8139ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 8149ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 8159ae82921SPaul Mullowney ii[k] = a->i[j]; 8169ae82921SPaul Mullowney ridx[k]= j; 8179ae82921SPaul Mullowney k++; 8189ae82921SPaul Mullowney } 8199ae82921SPaul Mullowney } 8209ae82921SPaul Mullowney ii[cusparseMat->nonzerorow] = a->nz; 8212205254eSKarl Rupp 8229ae82921SPaul Mullowney m = cusparseMat->nonzerorow; 8239ae82921SPaul Mullowney } 8249ae82921SPaul Mullowney 825e057df02SPaul Mullowney /* Build our matrix ... 
first determine the GPU storage type */ 826e057df02SPaul Mullowney cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]); 8279ae82921SPaul Mullowney 828e057df02SPaul Mullowney /* Create the streams and events (if desired). */ 8299ae82921SPaul Mullowney PetscMPIInt size; 8309ae82921SPaul Mullowney ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 8319ae82921SPaul Mullowney ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 8329ae82921SPaul Mullowney 833087f3262SPaul Mullowney ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest); 834087f3262SPaul Mullowney if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) { 835087f3262SPaul Mullowney /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */ 836087f3262SPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 837087f3262SPaul Mullowney cusparseMat->isSymmOrHerm = PETSC_FALSE; 838087f3262SPaul Mullowney } else { 839087f3262SPaul Mullowney ierr = MatIsSymmetric(A,0.0,&symmetryTest); 840087f3262SPaul Mullowney if (symmetryTest) { 841087f3262SPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 842087f3262SPaul Mullowney cusparseMat->isSymmOrHerm = PETSC_TRUE; 843087f3262SPaul Mullowney } else { 844087f3262SPaul Mullowney ierr = MatIsHermitian(A,0.0,&hermitianTest); 845087f3262SPaul Mullowney if (hermitianTest) { 846087f3262SPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 847087f3262SPaul Mullowney cusparseMat->isSymmOrHerm = 
PETSC_TRUE; 848087f3262SPaul Mullowney } else { 849087f3262SPaul Mullowney /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */ 850087f3262SPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 851087f3262SPaul Mullowney cusparseMat->isSymmOrHerm = PETSC_FALSE; 852087f3262SPaul Mullowney } 853087f3262SPaul Mullowney } 854087f3262SPaul Mullowney } 855ca45077fSPaul Mullowney 856e057df02SPaul Mullowney /* lastly, build the matrix */ 8579ae82921SPaul Mullowney ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 8589ae82921SPaul Mullowney cusparseMat->mat->setCPRowIndices(ridx, m); 8599ae82921SPaul Mullowney if (!a->compressedrow.use) { 8609ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 8619ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 8629ae82921SPaul Mullowney } 8639ae82921SPaul Mullowney cusparseMat->tempvec = new CUSPARRAY; 8649ae82921SPaul Mullowney cusparseMat->tempvec->resize(m); 8659ae82921SPaul Mullowney } catch(char *ex) { 8669ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 8679ae82921SPaul Mullowney } 8689ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 8692205254eSKarl Rupp 8709ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 8712205254eSKarl Rupp 8729ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 8739ae82921SPaul Mullowney } 8749ae82921SPaul Mullowney PetscFunctionReturn(0); 8759ae82921SPaul Mullowney } 8769ae82921SPaul Mullowney 8779ae82921SPaul Mullowney #undef __FUNCT__ 8789ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 8799ae82921SPaul Mullowney PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 8809ae82921SPaul Mullowney { 8819ae82921SPaul Mullowney 
PetscErrorCode ierr; 8829ae82921SPaul Mullowney 8839ae82921SPaul Mullowney PetscFunctionBegin; 8849ae82921SPaul Mullowney if (right) { 885ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 8869ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 8879ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 8889ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 8899ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 8909ae82921SPaul Mullowney } 8919ae82921SPaul Mullowney if (left) { 892ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 8939ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 8949ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 8959ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 8969ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 8979ae82921SPaul Mullowney } 8989ae82921SPaul Mullowney PetscFunctionReturn(0); 8999ae82921SPaul Mullowney } 9009ae82921SPaul Mullowney 9019ae82921SPaul Mullowney #undef __FUNCT__ 9029ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 9039ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 9049ae82921SPaul Mullowney { 9059ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 9069ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 907bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray; 908*b175d8bbSPaul Mullowney PetscErrorCode ierr; 9099ae82921SPaul Mullowney 9109ae82921SPaul Mullowney PetscFunctionBegin; 911e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 912e057df02SPaul Mullowney ierr = 
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 9139ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 9149ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 9159ae82921SPaul Mullowney try { 9169ae82921SPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 9179ae82921SPaul Mullowney } catch (char *ex) { 9189ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 9199ae82921SPaul Mullowney } 9209ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 9219ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 922ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 9239ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 924ca45077fSPaul Mullowney } 925087f3262SPaul Mullowney if (cusparseMat->isSymmOrHerm) { 926087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 927087f3262SPaul Mullowney } else { 9289ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 929087f3262SPaul Mullowney } 9309ae82921SPaul Mullowney PetscFunctionReturn(0); 9319ae82921SPaul Mullowney } 9329ae82921SPaul Mullowney 9339ae82921SPaul Mullowney #undef __FUNCT__ 934ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 935ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 936ca45077fSPaul Mullowney { 937ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 938ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 939bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray; 940*b175d8bbSPaul Mullowney PetscErrorCode ierr; 941ca45077fSPaul Mullowney 942ca45077fSPaul Mullowney PetscFunctionBegin; 943e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 
944e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 945bda325fcSPaul Mullowney if (!cusparseMat->hasTranspose) { 946bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 947bda325fcSPaul Mullowney cusparseMat->hasTranspose=PETSC_TRUE; 948bda325fcSPaul Mullowney } 949ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 950ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 951ca45077fSPaul Mullowney try { 952087f3262SPaul Mullowney ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr); 953ca45077fSPaul Mullowney } catch (char *ex) { 954ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 955ca45077fSPaul Mullowney } 956ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 957ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 958ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 959ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 960ca45077fSPaul Mullowney } 961087f3262SPaul Mullowney if (cusparseMat->isSymmOrHerm) { 962087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 963087f3262SPaul Mullowney } else { 964ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 965087f3262SPaul Mullowney } 966ca45077fSPaul Mullowney PetscFunctionReturn(0); 967ca45077fSPaul Mullowney } 968ca45077fSPaul Mullowney 969ca45077fSPaul Mullowney #undef __FUNCT__ 9709ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 9719ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 9729ae82921SPaul Mullowney { 9739ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 9749ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 975bda325fcSPaul Mullowney 
CUSPARRAY *xarray,*yarray,*zarray; 976*b175d8bbSPaul Mullowney PetscErrorCode ierr; 9776e111a19SKarl Rupp 9789ae82921SPaul Mullowney PetscFunctionBegin; 979e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 980e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 9819ae82921SPaul Mullowney try { 9829ae82921SPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 9839ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 9849ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 9859ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 9869ae82921SPaul Mullowney 987e057df02SPaul Mullowney /* multiply add */ 9889ae82921SPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 9899ae82921SPaul Mullowney 9909ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 9919ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 9929ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 9939ae82921SPaul Mullowney 9949ae82921SPaul Mullowney } catch(char *ex) { 9959ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 9969ae82921SPaul Mullowney } 9979ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 998087f3262SPaul Mullowney if (cusparseMat->isSymmOrHerm) { 999087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 1000087f3262SPaul Mullowney } else { 10019ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1002087f3262SPaul Mullowney } 10039ae82921SPaul Mullowney PetscFunctionReturn(0); 10049ae82921SPaul Mullowney } 10059ae82921SPaul Mullowney 10069ae82921SPaul Mullowney #undef __FUNCT__ 1007*b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1008ca45077fSPaul Mullowney 
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1009ca45077fSPaul Mullowney { 1010ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1011ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 1012ca45077fSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 1013*b175d8bbSPaul Mullowney PetscErrorCode ierr; 10146e111a19SKarl Rupp 1015ca45077fSPaul Mullowney PetscFunctionBegin; 1016e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1017e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1018bda325fcSPaul Mullowney if (!cusparseMat->hasTranspose) { 1019bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1020bda325fcSPaul Mullowney cusparseMat->hasTranspose=PETSC_TRUE; 1021bda325fcSPaul Mullowney } 1022ca45077fSPaul Mullowney try { 1023ca45077fSPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1024ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1025ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1026ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1027ca45077fSPaul Mullowney 1028e057df02SPaul Mullowney /* multiply add with matrix transpose */ 1029087f3262SPaul Mullowney ierr = cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr); 1030ca45077fSPaul Mullowney 1031ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1032ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1033ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1034ca45077fSPaul Mullowney 1035ca45077fSPaul Mullowney } catch(char *ex) { 1036ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1037ca45077fSPaul Mullowney } 1038ca45077fSPaul Mullowney ierr = 
WaitForGPU();CHKERRCUSP(ierr); 1039087f3262SPaul Mullowney if (cusparseMat->isSymmOrHerm) { 1040087f3262SPaul Mullowney ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr); 1041087f3262SPaul Mullowney } else { 1042ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1043087f3262SPaul Mullowney } 1044ca45077fSPaul Mullowney PetscFunctionReturn(0); 1045ca45077fSPaul Mullowney } 1046ca45077fSPaul Mullowney 1047ca45077fSPaul Mullowney #undef __FUNCT__ 10489ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 10499ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 10509ae82921SPaul Mullowney { 10519ae82921SPaul Mullowney PetscErrorCode ierr; 10526e111a19SKarl Rupp 10539ae82921SPaul Mullowney PetscFunctionBegin; 10549ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1055e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 10569ae82921SPaul Mullowney if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1057bbf3fe20SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 1058bbf3fe20SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1059bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1060bbf3fe20SPaul Mullowney A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 10619ae82921SPaul Mullowney PetscFunctionReturn(0); 10629ae82921SPaul Mullowney } 10639ae82921SPaul Mullowney 10649ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 10659ae82921SPaul Mullowney #undef __FUNCT__ 10669ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1067e057df02SPaul Mullowney /*@ 10689ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1069e057df02SPaul Mullowney (the default parallel PETSc format). 
This matrix will ultimately pushed down 1070e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1071e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 1072e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 1073e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 10749ae82921SPaul Mullowney 10759ae82921SPaul Mullowney Collective on MPI_Comm 10769ae82921SPaul Mullowney 10779ae82921SPaul Mullowney Input Parameters: 10789ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 10799ae82921SPaul Mullowney . m - number of rows 10809ae82921SPaul Mullowney . n - number of columns 10819ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 10829ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 10830298fd71SBarry Smith (possibly different for each row) or NULL 10849ae82921SPaul Mullowney 10859ae82921SPaul Mullowney Output Parameter: 10869ae82921SPaul Mullowney . A - the matrix 10879ae82921SPaul Mullowney 10889ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 10899ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 10909ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 10919ae82921SPaul Mullowney 10929ae82921SPaul Mullowney Notes: 10939ae82921SPaul Mullowney If nnz is given then nz is ignored 10949ae82921SPaul Mullowney 10959ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 10969ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 10979ae82921SPaul Mullowney storage. 
That is, the stored row and column indices can begin at 10989ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 10999ae82921SPaul Mullowney 11009ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 11010298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 11029ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 11039ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 11049ae82921SPaul Mullowney 11059ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 11069ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 11079ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 11089ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 11099ae82921SPaul Mullowney 11109ae82921SPaul Mullowney Level: intermediate 11119ae82921SPaul Mullowney 1112e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 11139ae82921SPaul Mullowney @*/ 11149ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 11159ae82921SPaul Mullowney { 11169ae82921SPaul Mullowney PetscErrorCode ierr; 11179ae82921SPaul Mullowney 11189ae82921SPaul Mullowney PetscFunctionBegin; 11199ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 11209ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 11219ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 11229ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 11239ae82921SPaul Mullowney PetscFunctionReturn(0); 
11249ae82921SPaul Mullowney } 11259ae82921SPaul Mullowney 11269ae82921SPaul Mullowney #undef __FUNCT__ 11279ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 11289ae82921SPaul Mullowney PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 11299ae82921SPaul Mullowney { 11309ae82921SPaul Mullowney PetscErrorCode ierr; 11319ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 11329ae82921SPaul Mullowney 11339ae82921SPaul Mullowney PetscFunctionBegin; 11349ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 11359ae82921SPaul Mullowney try { 11369ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 11379ae82921SPaul Mullowney delete (GPU_Matrix_Ifc*)(cusparseMat->mat); 11389ae82921SPaul Mullowney } 11392205254eSKarl Rupp if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec; 11409ae82921SPaul Mullowney delete cusparseMat; 11419ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 11429ae82921SPaul Mullowney } catch(char *ex) { 11439ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 11449ae82921SPaul Mullowney } 11459ae82921SPaul Mullowney } else { 1146e057df02SPaul Mullowney /* The triangular factors */ 11479ae82921SPaul Mullowney try { 11489ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 11499ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 11509ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 11519ae82921SPaul Mullowney delete (GPU_Matrix_Ifc*) cusparseMatLo; 11529ae82921SPaul Mullowney delete (GPU_Matrix_Ifc*) cusparseMatUp; 11539ae82921SPaul Mullowney delete (CUSPARRAY*) cusparseTriFactors->tempvec; 11549ae82921SPaul Mullowney delete cusparseTriFactors; 11559ae82921SPaul Mullowney } catch(char *ex) { 11569ae82921SPaul Mullowney 
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 11579ae82921SPaul Mullowney } 11589ae82921SPaul Mullowney } 11599ae82921SPaul Mullowney if (MAT_cusparseHandle) { 11609ae82921SPaul Mullowney cusparseStatus_t stat; 11619ae82921SPaul Mullowney stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 11622205254eSKarl Rupp 11639ae82921SPaul Mullowney MAT_cusparseHandle=0; 11649ae82921SPaul Mullowney } 11659ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 11669ae82921SPaul Mullowney A->spptr = 0; 11679ae82921SPaul Mullowney 11689ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 11699ae82921SPaul Mullowney PetscFunctionReturn(0); 11709ae82921SPaul Mullowney } 11719ae82921SPaul Mullowney 11729ae82921SPaul Mullowney #undef __FUNCT__ 11739ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 11748cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 11759ae82921SPaul Mullowney { 11769ae82921SPaul Mullowney PetscErrorCode ierr; 11779ae82921SPaul Mullowney 11789ae82921SPaul Mullowney PetscFunctionBegin; 11799ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 11809ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 1181e057df02SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created. 
1182e057df02SPaul Mullowney now build a GPU matrix data structure */ 11839ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 11842205254eSKarl Rupp 11859ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 11869ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec = 0; 1187e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1188bda325fcSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE; 1189087f3262SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE; 11909ae82921SPaul Mullowney } else { 11919ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 1192debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 11932205254eSKarl Rupp 11949ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 11959ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 11969ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec = 0; 1197e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 1198bda325fcSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose = PETSC_FALSE; 1199087f3262SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm = PETSC_FALSE; 12009ae82921SPaul Mullowney } 1201e057df02SPaul Mullowney /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */ 12029ae82921SPaul Mullowney if (!MAT_cusparseHandle) { 12039ae82921SPaul Mullowney cusparseStatus_t stat; 12049ae82921SPaul Mullowney stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 12059ae82921SPaul Mullowney } 1206e057df02SPaul Mullowney /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1207e057df02SPaul Mullowney default cusparse tri solve. 
Note the difference with the implementation in 1208e057df02SPaul Mullowney MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 120900de8ff0SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 12102205254eSKarl Rupp 12119ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 12129ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 12139ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 12149ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1215ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 1216ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1217ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1218ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 12192205254eSKarl Rupp 12209ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 12212205254eSKarl Rupp 12229ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 12232205254eSKarl Rupp 122400de8ff0SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 12259ae82921SPaul Mullowney PetscFunctionReturn(0); 12269ae82921SPaul Mullowney } 12279ae82921SPaul Mullowney 1228e057df02SPaul Mullowney /*M 1229e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1230e057df02SPaul Mullowney 1231e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1232e057df02SPaul Mullowney CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 1233e057df02SPaul Mullowney the CUSPARSE library. 
This type is only available when using the 'txpetscgpu' package. 1234e057df02SPaul Mullowney Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and 1235e057df02SPaul Mullowney the different GPU storage formats. 1236e057df02SPaul Mullowney 1237e057df02SPaul Mullowney Options Database Keys: 1238e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 12398468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 12408468deeeSKarl Rupp . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 12418468deeeSKarl Rupp - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 1242e057df02SPaul Mullowney 1243e057df02SPaul Mullowney Level: beginner 1244e057df02SPaul Mullowney 12458468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1246e057df02SPaul Mullowney M*/ 1247