19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 39ae82921SPaul Mullowney matrix storage format. 49ae82921SPaul Mullowney */ 59ae82921SPaul Mullowney 69ae82921SPaul Mullowney #include "petscconf.h" 79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN 89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 99ae82921SPaul Mullowney //#include "petscbt.h" 109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h" 119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h" 129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END 139ae82921SPaul Mullowney #undef VecType 149ae82921SPaul Mullowney #include "cusparsematimpl.h" 15e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 169ae82921SPaul Mullowney 17e057df02SPaul Mullowney /* this is such a hack ... but I don't know of another way to pass this variable 18e057df02SPaul Mullowney from one GPU_Matrix_Ifc class to another. This is necessary for the parallel 19e057df02SPaul Mullowney SpMV. Essentially, I need to use the same stream variable in two different 20e057df02SPaul Mullowney data structures. I do this by creating a single instance of that stream 21e057df02SPaul Mullowney and reuse it. */ 22ca45077fSPaul Mullowney cudaStream_t theBodyStream=0; 239ae82921SPaul Mullowney 249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 299ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 30e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat); 318dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 328dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 338dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 348dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 359ae82921SPaul Mullowney 369ae82921SPaul Mullowney #undef __FUNCT__ 379ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 389ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 399ae82921SPaul Mullowney { 409ae82921SPaul Mullowney PetscFunctionBegin; 419ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 429ae82921SPaul Mullowney PetscFunctionReturn(0); 439ae82921SPaul Mullowney } 449ae82921SPaul Mullowney 459ae82921SPaul Mullowney EXTERN_C_BEGIN 469ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 479ae82921SPaul Mullowney EXTERN_C_END 48e057df02SPaul Mullowney /* 499ae82921SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 509ae82921SPaul Mullowney on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 519ae82921SPaul Mullowney 529ae82921SPaul Mullowney Level: beginner 53e057df02SPaul Mullowney */ 549ae82921SPaul Mullowney 559ae82921SPaul Mullowney EXTERN_C_BEGIN 569ae82921SPaul Mullowney #undef __FUNCT__ 579ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 589ae82921SPaul Mullowney PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 599ae82921SPaul Mullowney { 609ae82921SPaul Mullowney PetscErrorCode ierr; 619ae82921SPaul Mullowney 629ae82921SPaul Mullowney PetscFunctionBegin; 639ae82921SPaul Mullowney ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 649ae82921SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 659ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 669ae82921SPaul Mullowney ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 679ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 68*2205254eSKarl Rupp 699ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 709ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 719ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 729ae82921SPaul Mullowney (*B)->factortype = ftype; 739ae82921SPaul Mullowney PetscFunctionReturn(0); 749ae82921SPaul Mullowney } 759ae82921SPaul Mullowney EXTERN_C_END 769ae82921SPaul Mullowney 77e057df02SPaul Mullowney EXTERN_C_BEGIN 789ae82921SPaul Mullowney #undef __FUNCT__ 79e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 80e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 81ca45077fSPaul Mullowney { 82ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 836e111a19SKarl Rupp 84ca45077fSPaul Mullowney PetscFunctionBegin; 85ca45077fSPaul Mullowney switch (op) { 86e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 87e057df02SPaul Mullowney cusparseMat->format = format; 88ca45077fSPaul Mullowney break; 89e057df02SPaul Mullowney case MAT_CUSPARSE_SOLVE: 90e057df02SPaul Mullowney cusparseMatSolveStorageFormat = format; 91ca45077fSPaul Mullowney break; 92e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 93e057df02SPaul Mullowney cusparseMat->format = format; 94e057df02SPaul Mullowney cusparseMatSolveStorageFormat = format; 95ca45077fSPaul Mullowney break; 96ca45077fSPaul Mullowney default: 97e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 98ca45077fSPaul Mullowney } 99ca45077fSPaul Mullowney PetscFunctionReturn(0); 100ca45077fSPaul Mullowney } 101e057df02SPaul Mullowney EXTERN_C_END 1029ae82921SPaul Mullowney 103e057df02SPaul Mullowney 104e057df02SPaul Mullowney /*@ 105e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 106e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 1078468deeeSKarl Rupp for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu 108e057df02SPaul Mullowney to build/install PETSc to use this package. 109e057df02SPaul Mullowney 110e057df02SPaul Mullowney Not Collective 111e057df02SPaul Mullowney 112e057df02SPaul Mullowney Input Parameters: 1138468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 1148468deeeSKarl Rupp . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1158468deeeSKarl Rupp - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB) 116e057df02SPaul Mullowney 117e057df02SPaul Mullowney Output Parameter: 118e057df02SPaul Mullowney 119e057df02SPaul Mullowney Level: intermediate 120e057df02SPaul Mullowney 1218468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 122e057df02SPaul Mullowney @*/ 123e057df02SPaul Mullowney #undef __FUNCT__ 124e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat" 125e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 126e057df02SPaul Mullowney { 127e057df02SPaul Mullowney PetscErrorCode ierr; 1286e111a19SKarl Rupp 129e057df02SPaul Mullowney PetscFunctionBegin; 130e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 131e057df02SPaul Mullowney ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 132e057df02SPaul Mullowney PetscFunctionReturn(0); 133e057df02SPaul Mullowney } 134e057df02SPaul Mullowney 1359ae82921SPaul Mullowney #undef __FUNCT__ 1369ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 1379ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 1389ae82921SPaul Mullowney { 1399ae82921SPaul Mullowney PetscErrorCode ierr; 140e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 1419ae82921SPaul Mullowney PetscBool flg; 1426e111a19SKarl Rupp 1439ae82921SPaul Mullowney PetscFunctionBegin; 144e057df02SPaul Mullowney ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 145e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 1469ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 147e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 1487083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 149e057df02SPaul Mullowney if (flg) { 150e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 151045c96e1SPaul Mullowney } 152*2205254eSKarl Rupp } else { 153e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 1547083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 155e057df02SPaul Mullowney if (flg) { 156e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 157e057df02SPaul Mullowney } 1589ae82921SPaul Mullowney } 1594c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 1607083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 1614c87dfd4SPaul Mullowney if (flg) { 1624c87dfd4SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 1634c87dfd4SPaul Mullowney } 1649ae82921SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 1659ae82921SPaul Mullowney PetscFunctionReturn(0); 1669ae82921SPaul Mullowney 1679ae82921SPaul Mullowney } 1689ae82921SPaul Mullowney 1699ae82921SPaul Mullowney #undef __FUNCT__ 1709ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 1719ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1729ae82921SPaul Mullowney { 1739ae82921SPaul Mullowney PetscErrorCode ierr; 1749ae82921SPaul Mullowney 1759ae82921SPaul Mullowney PetscFunctionBegin; 1769ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 177*2205254eSKarl Rupp 1789ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1799ae82921SPaul Mullowney PetscFunctionReturn(0); 1809ae82921SPaul Mullowney } 1819ae82921SPaul Mullowney 1829ae82921SPaul Mullowney #undef __FUNCT__ 1839ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 1849ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1859ae82921SPaul Mullowney { 1869ae82921SPaul Mullowney PetscErrorCode ierr; 1879ae82921SPaul Mullowney 1889ae82921SPaul Mullowney PetscFunctionBegin; 1899ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 190*2205254eSKarl Rupp 1919ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1929ae82921SPaul Mullowney PetscFunctionReturn(0); 1939ae82921SPaul Mullowney } 1949ae82921SPaul Mullowney 1959ae82921SPaul Mullowney #undef __FUNCT__ 196e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix" 197e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A) 1989ae82921SPaul Mullowney { 1999ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2009ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2019ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 2029ae82921SPaul Mullowney GPU_Matrix_Ifc * cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 2039ae82921SPaul Mullowney cusparseStatus_t stat; 2049ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2059ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2069ae82921SPaul Mullowney PetscErrorCode ierr; 2079ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2089ae82921SPaul Mullowney PetscScalar *AALo; 2099ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 2109ae82921SPaul Mullowney 2119ae82921SPaul Mullowney PetscFunctionBegin; 2129ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 2139ae82921SPaul Mullowney try { 2149ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2159ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 2169ae82921SPaul Mullowney 2179ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2189ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 2199ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 2209ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 2219ae82921SPaul Mullowney 2229ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2239ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2249ae82921SPaul Mullowney AiLo[n] = nzLower; 2259ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2269ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 2279ae82921SPaul Mullowney v = aa; 2289ae82921SPaul Mullowney vi = aj; 2299ae82921SPaul Mullowney offset = 1; 2309ae82921SPaul Mullowney rowOffset= 1; 2319ae82921SPaul Mullowney for (i=1; i<n; i++) { 2329ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 233e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 2349ae82921SPaul Mullowney AiLo[i] = rowOffset; 2359ae82921SPaul Mullowney rowOffset += nz+1; 2369ae82921SPaul Mullowney 2379ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 2389ae82921SPaul Mullowney ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 2399ae82921SPaul Mullowney 2409ae82921SPaul Mullowney offset += nz; 2419ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 2429ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 2439ae82921SPaul Mullowney offset += 1; 2449ae82921SPaul Mullowney 2459ae82921SPaul Mullowney v += nz; 2469ae82921SPaul Mullowney vi += nz; 2479ae82921SPaul Mullowney } 248e057df02SPaul Mullowney cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 249*2205254eSKarl Rupp 2509ae82921SPaul Mullowney stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 2519ae82921SPaul Mullowney ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 2529ae82921SPaul Mullowney stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 253*2205254eSKarl Rupp 2549ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 255*2205254eSKarl Rupp 2569ae82921SPaul Mullowney ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 2579ae82921SPaul Mullowney ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 2589ae82921SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 2599ae82921SPaul Mullowney } catch(char* ex) { 2609ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2619ae82921SPaul Mullowney } 2629ae82921SPaul Mullowney } 2639ae82921SPaul Mullowney PetscFunctionReturn(0); 2649ae82921SPaul Mullowney } 2659ae82921SPaul Mullowney 2669ae82921SPaul Mullowney #undef __FUNCT__ 267e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix" 268e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A) 2699ae82921SPaul Mullowney { 2709ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2719ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2729ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 2739ae82921SPaul Mullowney GPU_Matrix_Ifc * cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 2749ae82921SPaul Mullowney cusparseStatus_t stat; 2759ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 2769ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2779ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 2789ae82921SPaul Mullowney PetscScalar *AAUp; 2799ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 2809ae82921SPaul Mullowney PetscErrorCode ierr; 2819ae82921SPaul Mullowney 2829ae82921SPaul Mullowney PetscFunctionBegin; 2839ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 2849ae82921SPaul Mullowney try { 2859ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 2869ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 2879ae82921SPaul Mullowney 2889ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 2899ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 2909ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 2919ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 2929ae82921SPaul Mullowney 2939ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 2949ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 2959ae82921SPaul Mullowney AiUp[n]=nzUpper; 2969ae82921SPaul Mullowney offset = nzUpper; 2979ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 2989ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 2999ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 3009ae82921SPaul Mullowney 301e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 3029ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 3039ae82921SPaul Mullowney 304e057df02SPaul Mullowney /* decrement the offset */ 3059ae82921SPaul Mullowney offset -= (nz+1); 3069ae82921SPaul Mullowney 307e057df02SPaul Mullowney /* first, set the diagonal elements */ 3089ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 3099ae82921SPaul Mullowney AAUp[offset] = 1./v[nz]; 3109ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 3119ae82921SPaul Mullowney 3129ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 3139ae82921SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 3149ae82921SPaul Mullowney } 315e057df02SPaul Mullowney cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]); 316*2205254eSKarl Rupp 3179ae82921SPaul Mullowney stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 3189ae82921SPaul Mullowney ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 3199ae82921SPaul Mullowney stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 320*2205254eSKarl Rupp 3219ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 322*2205254eSKarl Rupp 3239ae82921SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 3249ae82921SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 3259ae82921SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 3269ae82921SPaul Mullowney } catch(char* ex) { 3279ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3289ae82921SPaul Mullowney } 3299ae82921SPaul Mullowney } 3309ae82921SPaul Mullowney PetscFunctionReturn(0); 3319ae82921SPaul Mullowney } 3329ae82921SPaul Mullowney 3339ae82921SPaul Mullowney #undef __FUNCT__ 334a4f1b371SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU" 335e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A) 3369ae82921SPaul Mullowney { 3379ae82921SPaul Mullowney PetscErrorCode ierr; 3389ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3399ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 3409ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 3419ae82921SPaul Mullowney PetscBool row_identity,col_identity; 3429ae82921SPaul Mullowney const PetscInt *r,*c; 3439ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3449ae82921SPaul Mullowney 3459ae82921SPaul Mullowney PetscFunctionBegin; 346e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 347e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 348*2205254eSKarl Rupp 3499ae82921SPaul Mullowney cusparseTriFactors->tempvec = new CUSPARRAY; 3509ae82921SPaul Mullowney cusparseTriFactors->tempvec->resize(n); 3519ae82921SPaul Mullowney 3529ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 353e057df02SPaul Mullowney /*lower triangular indices */ 3549ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3559ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 356*2205254eSKarl Rupp if (!row_identity) { 3579ae82921SPaul Mullowney ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 358*2205254eSKarl Rupp } 3599ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3609ae82921SPaul Mullowney 361e057df02SPaul Mullowney /*upper triangular indices */ 3629ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 3639ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 364*2205254eSKarl Rupp if (!col_identity) { 3659ae82921SPaul Mullowney ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 366*2205254eSKarl Rupp } 3679ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 3689ae82921SPaul Mullowney PetscFunctionReturn(0); 3699ae82921SPaul Mullowney } 3709ae82921SPaul Mullowney 3719ae82921SPaul Mullowney #undef __FUNCT__ 3729ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 3739ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 3749ae82921SPaul Mullowney { 3759ae82921SPaul Mullowney PetscErrorCode ierr; 3769ae82921SPaul Mullowney Mat_SeqAIJ *b =(Mat_SeqAIJ*)B->data; 3779ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 3789ae82921SPaul Mullowney PetscBool row_identity,col_identity; 3799ae82921SPaul Mullowney 3809ae82921SPaul Mullowney PetscFunctionBegin; 3819ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 382e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 3839ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3849ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3859ae82921SPaul Mullowney if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 3869ae82921SPaul Mullowney else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 3878dc1d2a3SPaul Mullowney 388e057df02SPaul Mullowney /* get the triangular factors */ 389e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 3909ae82921SPaul Mullowney PetscFunctionReturn(0); 3919ae82921SPaul Mullowney } 3929ae82921SPaul Mullowney 3939ae82921SPaul Mullowney 3949ae82921SPaul Mullowney 3959ae82921SPaul Mullowney #undef __FUNCT__ 3969ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 3979ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 3989ae82921SPaul Mullowney { 3999ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4009ae82921SPaul Mullowney PetscErrorCode ierr; 401da80777bSKarl Rupp CUSPARRAY *xGPU=PETSC_NULL, *bGPU=PETSC_NULL; 4029ae82921SPaul Mullowney cusparseStatus_t stat; 4039ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4049ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 4059ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 4069ae82921SPaul Mullowney CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 4079ae82921SPaul Mullowney 4089ae82921SPaul Mullowney PetscFunctionBegin; 409e057df02SPaul Mullowney /* Get the GPU pointers */ 4109ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4119ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 4129ae82921SPaul Mullowney 413e057df02SPaul Mullowney /* solve with reordering */ 4149ae82921SPaul Mullowney ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 4159ae82921SPaul Mullowney stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 4169ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 4179ae82921SPaul Mullowney ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 4189ae82921SPaul Mullowney 4199ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 4209ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4219ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 4229ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 4239ae82921SPaul Mullowney PetscFunctionReturn(0); 4249ae82921SPaul Mullowney } 4259ae82921SPaul Mullowney 4269ae82921SPaul Mullowney 4279ae82921SPaul Mullowney 4289ae82921SPaul Mullowney 4299ae82921SPaul Mullowney #undef __FUNCT__ 4309ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 4319ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 4329ae82921SPaul Mullowney { 4339ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4349ae82921SPaul Mullowney PetscErrorCode ierr; 435da80777bSKarl Rupp CUSPARRAY *xGPU=PETSC_NULL, *bGPU=PETSC_NULL; 4369ae82921SPaul Mullowney cusparseStatus_t stat; 4379ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4389ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 4399ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 4409ae82921SPaul Mullowney CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 4419ae82921SPaul Mullowney 4429ae82921SPaul Mullowney PetscFunctionBegin; 443e057df02SPaul Mullowney /* Get the GPU pointers */ 4449ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4459ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 4469ae82921SPaul Mullowney 447e057df02SPaul Mullowney /* solve */ 4489ae82921SPaul Mullowney stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 4499ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 4509ae82921SPaul Mullowney 4519ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 4529ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4539ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 4549ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 4559ae82921SPaul Mullowney PetscFunctionReturn(0); 4569ae82921SPaul Mullowney } 4579ae82921SPaul Mullowney 4589ae82921SPaul Mullowney #undef __FUNCT__ 459e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 460e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 4619ae82921SPaul Mullowney { 4629ae82921SPaul Mullowney 4639ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 4649ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4659ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 4669ae82921SPaul Mullowney PetscErrorCode ierr; 4679ae82921SPaul Mullowney 4689ae82921SPaul Mullowney 4699ae82921SPaul Mullowney PetscFunctionBegin; 4709ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 4719ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 4729ae82921SPaul Mullowney /* 4739ae82921SPaul Mullowney It may be possible to reuse nonzero structure with new matrix values but 4749ae82921SPaul Mullowney for simplicity and insured correctness we delete and build a new matrix on 4759ae82921SPaul Mullowney the GPU. Likely a very small performance hit. 4769ae82921SPaul Mullowney */ 4779ae82921SPaul Mullowney if (cusparseMat->mat) { 4789ae82921SPaul Mullowney try { 4799ae82921SPaul Mullowney delete cusparseMat->mat; 480*2205254eSKarl Rupp if (cusparseMat->tempvec) delete cusparseMat->tempvec; 4819ae82921SPaul Mullowney 4829ae82921SPaul Mullowney } catch(char *ex) { 4839ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4849ae82921SPaul Mullowney } 4859ae82921SPaul Mullowney } 4869ae82921SPaul Mullowney try { 4879ae82921SPaul Mullowney cusparseMat->nonzerorow=0; 488*2205254eSKarl Rupp for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 4899ae82921SPaul Mullowney 4909ae82921SPaul Mullowney if (a->compressedrow.use) { 4919ae82921SPaul Mullowney m = a->compressedrow.nrows; 4929ae82921SPaul Mullowney ii = a->compressedrow.i; 4939ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 4949ae82921SPaul Mullowney } else { 495e057df02SPaul Mullowney /* Forcing compressed row on the GPU ... only relevant for CSR storage */ 4969ae82921SPaul Mullowney int k=0; 4979ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 4989ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 4999ae82921SPaul Mullowney ii[0]=0; 5009ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 5019ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 5029ae82921SPaul Mullowney ii[k] = a->i[j]; 5039ae82921SPaul Mullowney ridx[k]= j; 5049ae82921SPaul Mullowney k++; 5059ae82921SPaul Mullowney } 5069ae82921SPaul Mullowney } 5079ae82921SPaul Mullowney ii[cusparseMat->nonzerorow] = a->nz; 508*2205254eSKarl Rupp 5099ae82921SPaul Mullowney m = cusparseMat->nonzerorow; 5109ae82921SPaul Mullowney } 5119ae82921SPaul Mullowney 512e057df02SPaul Mullowney /* Build our matrix ... first determine the GPU storage type */ 513e057df02SPaul Mullowney cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]); 5149ae82921SPaul Mullowney 515e057df02SPaul Mullowney /* Create the streams and events (if desired). */ 5169ae82921SPaul Mullowney PetscMPIInt size; 5179ae82921SPaul Mullowney ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 5189ae82921SPaul Mullowney ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 5199ae82921SPaul Mullowney 520e057df02SPaul Mullowney /* FILL MODE UPPER is irrelevant */ 521ca45077fSPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 522ca45077fSPaul Mullowney 523e057df02SPaul Mullowney /* lastly, build the matrix */ 5249ae82921SPaul Mullowney ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 5259ae82921SPaul Mullowney cusparseMat->mat->setCPRowIndices(ridx, m); 5269ae82921SPaul Mullowney if (!a->compressedrow.use) { 5279ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 5289ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 5299ae82921SPaul Mullowney } 5309ae82921SPaul Mullowney cusparseMat->tempvec = new CUSPARRAY; 5319ae82921SPaul Mullowney cusparseMat->tempvec->resize(m); 5329ae82921SPaul Mullowney } catch(char *ex) { 5339ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5349ae82921SPaul Mullowney } 5359ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 536*2205254eSKarl Rupp 5379ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 538*2205254eSKarl Rupp 5399ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 5409ae82921SPaul Mullowney } 5419ae82921SPaul Mullowney PetscFunctionReturn(0); 5429ae82921SPaul Mullowney } 5439ae82921SPaul Mullowney 5449ae82921SPaul Mullowney #undef __FUNCT__ 5459ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 5469ae82921SPaul Mullowney PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 5479ae82921SPaul Mullowney { 5489ae82921SPaul Mullowney PetscErrorCode ierr; 5499ae82921SPaul Mullowney 5509ae82921SPaul Mullowney PetscFunctionBegin; 5519ae82921SPaul Mullowney if (right) { 5529ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 5539ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 5549ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 5559ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 5569ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 5579ae82921SPaul Mullowney } 5589ae82921SPaul Mullowney if (left) { 5599ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 5609ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 5619ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 5629ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 5639ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 5649ae82921SPaul Mullowney } 5659ae82921SPaul Mullowney PetscFunctionReturn(0); 5669ae82921SPaul Mullowney } 5679ae82921SPaul Mullowney 5689ae82921SPaul Mullowney #undef __FUNCT__ 5699ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 5709ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 5719ae82921SPaul Mullowney { 5729ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5739ae82921SPaul Mullowney PetscErrorCode ierr; 5749ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 575da80777bSKarl Rupp CUSPARRAY *xarray =PETSC_NULL,*yarray=PETSC_NULL; 5769ae82921SPaul Mullowney 5779ae82921SPaul Mullowney PetscFunctionBegin; 578e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 579e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 5809ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 5819ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 5829ae82921SPaul Mullowney try { 5839ae82921SPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 5849ae82921SPaul Mullowney } catch (char *ex) { 5859ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5869ae82921SPaul Mullowney } 5879ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 5889ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 589ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 5909ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 591ca45077fSPaul Mullowney } 5929ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 5939ae82921SPaul Mullowney PetscFunctionReturn(0); 5949ae82921SPaul Mullowney } 5959ae82921SPaul Mullowney 5969ae82921SPaul Mullowney 5979ae82921SPaul Mullowney #undef __FUNCT__ 598ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 599ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 600ca45077fSPaul Mullowney { 601ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 602ca45077fSPaul Mullowney PetscErrorCode ierr; 603ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 604da80777bSKarl Rupp CUSPARRAY *xarray =PETSC_NULL,*yarray=PETSC_NULL; 605ca45077fSPaul Mullowney 606ca45077fSPaul Mullowney PetscFunctionBegin; 607e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 608e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 609ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 610ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 611ca45077fSPaul Mullowney try { 612ca45077fSPaul Mullowney #if !defined(PETSC_USE_COMPLEX) 613ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 614ca45077fSPaul Mullowney #else 615ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 616ca45077fSPaul Mullowney #endif 617ca45077fSPaul Mullowney } catch (char *ex) { 618ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 619ca45077fSPaul Mullowney } 620ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 621ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 622ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 623ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 624ca45077fSPaul Mullowney } 625ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 626ca45077fSPaul Mullowney PetscFunctionReturn(0); 627ca45077fSPaul Mullowney } 628ca45077fSPaul Mullowney 629ca45077fSPaul Mullowney #undef __FUNCT__ 6309ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 6319ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 6329ae82921SPaul Mullowney { 6339ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6349ae82921SPaul Mullowney PetscErrorCode ierr; 6359ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 636da80777bSKarl Rupp CUSPARRAY *xarray = PETSC_NULL,*yarray=PETSC_NULL,*zarray=PETSC_NULL; 6376e111a19SKarl Rupp 6389ae82921SPaul Mullowney PetscFunctionBegin; 639e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 640e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 6419ae82921SPaul Mullowney try { 6429ae82921SPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 6439ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 6449ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 6459ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 6469ae82921SPaul Mullowney 647e057df02SPaul Mullowney /* multiply add */ 6489ae82921SPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 6499ae82921SPaul Mullowney 6509ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 6519ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 6529ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 6539ae82921SPaul Mullowney 6549ae82921SPaul Mullowney } catch(char *ex) { 6559ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6569ae82921SPaul Mullowney } 6579ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 6589ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 6599ae82921SPaul Mullowney PetscFunctionReturn(0); 6609ae82921SPaul Mullowney } 6619ae82921SPaul Mullowney 6629ae82921SPaul Mullowney #undef __FUNCT__ 663ca45077fSPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 664ca45077fSPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 665ca45077fSPaul Mullowney { 666ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 667ca45077fSPaul Mullowney PetscErrorCode ierr; 668ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 669ca45077fSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 6706e111a19SKarl Rupp 671ca45077fSPaul Mullowney PetscFunctionBegin; 672e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 673e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 674ca45077fSPaul Mullowney try { 675ca45077fSPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 676ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 677ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 678ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 679ca45077fSPaul Mullowney 680e057df02SPaul Mullowney /* multiply add with matrix transpose */ 681ca45077fSPaul Mullowney #if !defined(PETSC_USE_COMPLEX) 682ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 683ca45077fSPaul Mullowney #else 684ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 685ca45077fSPaul Mullowney #endif 686ca45077fSPaul Mullowney 687ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 688ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 689ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 690ca45077fSPaul Mullowney 691ca45077fSPaul Mullowney } catch(char *ex) { 692ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 693ca45077fSPaul Mullowney } 694ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 695ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 696ca45077fSPaul Mullowney PetscFunctionReturn(0); 697ca45077fSPaul Mullowney } 698ca45077fSPaul Mullowney 699ca45077fSPaul Mullowney #undef __FUNCT__ 7009ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 7019ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 7029ae82921SPaul Mullowney { 7039ae82921SPaul Mullowney PetscErrorCode ierr; 7046e111a19SKarl Rupp 7059ae82921SPaul Mullowney PetscFunctionBegin; 7069ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 707e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 7089ae82921SPaul Mullowney if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 709bbf3fe20SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 710bbf3fe20SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 711bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 712bbf3fe20SPaul Mullowney A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 7139ae82921SPaul Mullowney PetscFunctionReturn(0); 7149ae82921SPaul Mullowney } 7159ae82921SPaul Mullowney 7169ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 7179ae82921SPaul Mullowney #undef __FUNCT__ 7189ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 719e057df02SPaul Mullowney /*@ 7209ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 721e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 722e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 723e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 724e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 725e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 7269ae82921SPaul Mullowney 7279ae82921SPaul Mullowney Collective on MPI_Comm 7289ae82921SPaul Mullowney 7299ae82921SPaul Mullowney Input Parameters: 7309ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 7319ae82921SPaul Mullowney . m - number of rows 7329ae82921SPaul Mullowney . n - number of columns 7339ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 7349ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 7359ae82921SPaul Mullowney (possibly different for each row) or PETSC_NULL 7369ae82921SPaul Mullowney 7379ae82921SPaul Mullowney Output Parameter: 7389ae82921SPaul Mullowney . A - the matrix 7399ae82921SPaul Mullowney 7409ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 7419ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 7429ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 7439ae82921SPaul Mullowney 7449ae82921SPaul Mullowney Notes: 7459ae82921SPaul Mullowney If nnz is given then nz is ignored 7469ae82921SPaul Mullowney 7479ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 7489ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 7499ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 7509ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 7519ae82921SPaul Mullowney 7529ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 7539ae82921SPaul Mullowney Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 7549ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 7559ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 7569ae82921SPaul Mullowney 7579ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 7589ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 7599ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 7609ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 7619ae82921SPaul Mullowney 7629ae82921SPaul Mullowney Level: intermediate 7639ae82921SPaul Mullowney 764e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 7659ae82921SPaul Mullowney @*/ 7669ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 7679ae82921SPaul Mullowney { 7689ae82921SPaul Mullowney PetscErrorCode ierr; 7699ae82921SPaul Mullowney 7709ae82921SPaul Mullowney PetscFunctionBegin; 7719ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 7729ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 7739ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 7749ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 7759ae82921SPaul Mullowney PetscFunctionReturn(0); 7769ae82921SPaul Mullowney } 7779ae82921SPaul Mullowney 7789ae82921SPaul Mullowney #undef __FUNCT__ 7799ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 7809ae82921SPaul Mullowney PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 7819ae82921SPaul Mullowney { 7829ae82921SPaul Mullowney PetscErrorCode ierr; 7839ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 7849ae82921SPaul Mullowney 7859ae82921SPaul Mullowney PetscFunctionBegin; 7869ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 7879ae82921SPaul Mullowney try { 7889ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 7899ae82921SPaul Mullowney delete (GPU_Matrix_Ifc*)(cusparseMat->mat); 7909ae82921SPaul Mullowney } 791*2205254eSKarl Rupp if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec; 7929ae82921SPaul Mullowney delete cusparseMat; 7939ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 7949ae82921SPaul Mullowney } catch(char *ex) { 7959ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 7969ae82921SPaul Mullowney } 7979ae82921SPaul Mullowney } else { 798e057df02SPaul Mullowney /* The triangular factors */ 7999ae82921SPaul Mullowney try { 8009ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 8019ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 8029ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 8039ae82921SPaul Mullowney delete (GPU_Matrix_Ifc*) cusparseMatLo; 8049ae82921SPaul Mullowney delete (GPU_Matrix_Ifc*) cusparseMatUp; 8059ae82921SPaul Mullowney delete (CUSPARRAY*) cusparseTriFactors->tempvec; 8069ae82921SPaul Mullowney delete cusparseTriFactors; 8079ae82921SPaul Mullowney } catch(char *ex) { 8089ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 8099ae82921SPaul Mullowney } 8109ae82921SPaul Mullowney } 8119ae82921SPaul Mullowney if (MAT_cusparseHandle) { 8129ae82921SPaul Mullowney cusparseStatus_t stat; 8139ae82921SPaul Mullowney stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 814*2205254eSKarl Rupp 8159ae82921SPaul Mullowney MAT_cusparseHandle=0; 8169ae82921SPaul Mullowney } 8179ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 8189ae82921SPaul Mullowney A->spptr = 0; 8199ae82921SPaul Mullowney 8209ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 8219ae82921SPaul Mullowney PetscFunctionReturn(0); 8229ae82921SPaul Mullowney } 8239ae82921SPaul Mullowney 8249ae82921SPaul Mullowney EXTERN_C_BEGIN 8259ae82921SPaul Mullowney #undef __FUNCT__ 8269ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 8279ae82921SPaul Mullowney PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 8289ae82921SPaul Mullowney { 8299ae82921SPaul Mullowney PetscErrorCode ierr; 8309ae82921SPaul Mullowney 8319ae82921SPaul Mullowney PetscFunctionBegin; 8329ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 8339ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 834e057df02SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created. 835e057df02SPaul Mullowney now build a GPU matrix data structure */ 8369ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 837*2205254eSKarl Rupp 8389ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 8399ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec = 0; 840e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 8419ae82921SPaul Mullowney } else { 8429ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 843debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 844*2205254eSKarl Rupp 8459ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 8469ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 8479ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec = 0; 848e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 8499ae82921SPaul Mullowney } 850e057df02SPaul Mullowney /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */ 8519ae82921SPaul Mullowney if (!MAT_cusparseHandle) { 8529ae82921SPaul Mullowney cusparseStatus_t stat; 8539ae82921SPaul Mullowney stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 8549ae82921SPaul Mullowney } 855e057df02SPaul Mullowney /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 856e057df02SPaul Mullowney default cusparse tri solve. Note the difference with the implementation in 857e057df02SPaul Mullowney MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 8589ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 859*2205254eSKarl Rupp 8609ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 8619ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 8629ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 8639ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 864ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 865ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 866ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 867ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 868*2205254eSKarl Rupp 8699ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 870*2205254eSKarl Rupp 8719ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 872*2205254eSKarl Rupp 873e057df02SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 8749ae82921SPaul Mullowney PetscFunctionReturn(0); 8759ae82921SPaul Mullowney } 8769ae82921SPaul Mullowney EXTERN_C_END 8779ae82921SPaul Mullowney 878e057df02SPaul Mullowney /*M 879e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 880e057df02SPaul Mullowney 881e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 882e057df02SPaul Mullowney CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 883e057df02SPaul Mullowney the CUSPARSE library. This type is only available when using the 'txpetscgpu' package. 884e057df02SPaul Mullowney Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and 885e057df02SPaul Mullowney the different GPU storage formats. 886e057df02SPaul Mullowney 887e057df02SPaul Mullowney Options Database Keys: 888e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 8898468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 8908468deeeSKarl Rupp . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 8918468deeeSKarl Rupp - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 892e057df02SPaul Mullowney 893e057df02SPaul Mullowney Level: beginner 894e057df02SPaul Mullowney 8958468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 896e057df02SPaul Mullowney M*/ 897