/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

/* String names for the MatCUSPARSEStorageFormat enum values; the trailing entries
   (enum type name, option prefix, terminating 0) are the layout PetscOptionsEnum()
   expects for enum option processing. */
const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream=0;

/* Forward declarations for the SeqAIJCUSPARSE operations implemented in this file. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports MATSOLVERCUSPARSE as the solver package backing this factored matrix;
   composed onto factor matrices created by MatGetFactor_seqaij_cusparse() below. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END
/*
    MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
    on the GPU of type,
    seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp

    Level: beginner
*/

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Factor-creation routine for MATSOLVERCUSPARSE: delegates creation to the
   plain seqaij factor routine, then retypes the factor matrix as
   MATSEQAIJCUSPARSE and installs the CUSPARSE symbolic factorization hooks.
   Only LU-family factorizations (LU, ILU, ILUDT) are supported; anything else
   raises PETSC_ERR_SUP. */
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for SpMV and/or triangular solves. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    /* format used by the SpMV (MatMult) container */
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    /* presumably a file/global default consumed when the triangular factors are
       built; declared elsewhere (cusparsematimpl.h?) -- TODO confirm */
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* set both the SpMV format and the solve default */
    cusparseMat->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END


/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL.
        MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the type-specific implementation composed as "MatCUSPARSESetFormat_C";
     PetscTryMethod is a no-op for matrix types that do not provide it. */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_*_storage_format options database keys for a
   SeqAIJCUSPARSE matrix.  Unfactored matrices take the SpMV format option,
   factored matrices the triangular-solve format option, and
   -mat_cusparse_storage_format sets both. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): the PetscObjectOptionsBegin result is not CHKERRQ-checked, and it
     runs after PetscOptionsHead; confirm against the ordering used by other
     PETSc *SetFromOptions implementations of this era. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: option selects the SpMV storage format */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  else {
    /* factored matrix: option selects the triangular-solve storage format */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella option: applies to both SpMV and TriSolve */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU factorization: defers to the CPU SeqAIJ implementation, then
   installs the CUSPARSE numeric-factorization routine on the factor matrix. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU factorization: defers to the CPU SeqAIJ implementation, then
   installs the CUSPARSE numeric-factorization routine on the factor matrix. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/* Extracts the unit lower-triangular factor L (explicit 1.0 entries placed on the
   diagonal) from the host AIJ factored matrix and uploads it to the GPU in the
   configured storage format.  Runs only when the GPU copy is absent or stale. */
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;   /* host staging: row pointers / column indices */
  PetscScalar                  *AALo;          /* host staging: values */
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix; pinned host memory so the
         later upload can use fast DMA transfers */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v  = aa;
      vi = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU container for the configured storage format, upload L,
         and run the cusparse triangular-solve analysis phase */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* host staging buffers are no longer needed once the data is on the GPU */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/* Extracts the upper-triangular factor U from the host AIJ factored matrix and
   uploads it to the GPU in the configured storage format.  The diagonal entry is
   stored inverted (1./v[nz]) so the GPU solve can multiply instead of divide. */
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;   /* host staging: row pointers / column indices */
  PetscScalar                  *AAUp;          /* host staging: values */
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows backwards; the indexing
         here assumes a->diag decreases with i (U stored back-to-front by the
         SeqAIJ numeric factorization) -- confirm against MatLUFactorNumeric_SeqAIJ */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored as the reciprocal) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then copy the strictly-upper entries of row i after the diagonal slot */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU container for the configured storage format, upload U,
         and run the cusparse triangular-solve analysis phase */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      /* host staging buffers are no longer needed once the data is on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/* Builds both triangular factors on the GPU, allocates the scratch vector shared
   by the two solve sweeps, and records the row/column permutations that
   MatSolve_SeqAIJCUSPARSE applies around the sweeps. */
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* scratch vector shared by the lower and upper triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: only stored when the row ordering is non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: only stored when the column ordering is non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: runs the CPU SeqAIJ factorization, selects the
   MatSolve variant based on whether the orderings are identities, and mirrors
   the triangular factors onto the GPU. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b=(Mat_SeqAIJ *)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Triangular solve x = U^{-1} L^{-1} b on the GPU for a permuted factorization:
   applies the stored row reordering on input and the column reordering on output
   around the two sparse triangular sweeps. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;


  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute b into x, sweep L then U through the scratch
     vector, then undo the permutation in place */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}




#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Triangular solve x = U^{-1} L^{-1} b on the GPU for the natural (identity)
   ordering: no permutations are applied. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp =
(GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: L sweep into the scratch vector, U sweep into x */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copies the host AIJ matrix to the GPU if the GPU copy is absent or stale,
   always in compressed-row form (zero rows removed), allocating the
   format-specific GPU container and a work vector. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           m = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that actually contain nonzeros */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host already maintains a compressed-row view; reuse it */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired). */
      PetscMPIInt size;
      /* NOTE(review): the GLOBAL communicator size is queried here rather than the
         matrix's own communicator; theBodyStream is shared with the parallel SpMV
         (see the comment at the top of the file), so this may be deliberate --
         confirm PETSC_COMM_WORLD vs ((PetscObject)A)->comm. */
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* ii/ridx were allocated above only in this branch */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Creates CUSP-type right/left work vectors conforming to this matrix's
   column/row layouts, respectively. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
5389ae82921SPaul Mullowney PetscFunctionBegin; 5399ae82921SPaul Mullowney if (right) { 5409ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 5419ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 5429ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 5439ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 5449ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 5459ae82921SPaul Mullowney } 5469ae82921SPaul Mullowney if (left) { 5479ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 5489ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 5499ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 5509ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 5519ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 5529ae82921SPaul Mullowney } 5539ae82921SPaul Mullowney PetscFunctionReturn(0); 5549ae82921SPaul Mullowney } 5559ae82921SPaul Mullowney 5569ae82921SPaul Mullowney #undef __FUNCT__ 5579ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 5589ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 5599ae82921SPaul Mullowney { 5609ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5619ae82921SPaul Mullowney PetscErrorCode ierr; 5629ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 563da80777bSKarl Rupp CUSPARRAY *xarray=PETSC_NULL,*yarray=PETSC_NULL; 5649ae82921SPaul Mullowney 5659ae82921SPaul Mullowney PetscFunctionBegin; 566e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 567e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 5689ae82921SPaul Mullowney 
ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 5699ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 5709ae82921SPaul Mullowney try { 5719ae82921SPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 5729ae82921SPaul Mullowney } catch (char* ex) { 5739ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5749ae82921SPaul Mullowney } 5759ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 5769ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 577ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 5789ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 579ca45077fSPaul Mullowney } 5809ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 5819ae82921SPaul Mullowney PetscFunctionReturn(0); 5829ae82921SPaul Mullowney } 5839ae82921SPaul Mullowney 5849ae82921SPaul Mullowney 5859ae82921SPaul Mullowney #undef __FUNCT__ 586ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 587ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 588ca45077fSPaul Mullowney { 589ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 590ca45077fSPaul Mullowney PetscErrorCode ierr; 591ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 592da80777bSKarl Rupp CUSPARRAY *xarray=PETSC_NULL,*yarray=PETSC_NULL; 593ca45077fSPaul Mullowney 594ca45077fSPaul Mullowney PetscFunctionBegin; 595e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 596e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 597ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 598ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 599ca45077fSPaul Mullowney 
try { 600ca45077fSPaul Mullowney #if !defined(PETSC_USE_COMPLEX) 601ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 602ca45077fSPaul Mullowney #else 603ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 604ca45077fSPaul Mullowney #endif 605ca45077fSPaul Mullowney } catch (char* ex) { 606ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 607ca45077fSPaul Mullowney } 608ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 609ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 610ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 611ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 612ca45077fSPaul Mullowney } 613ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 614ca45077fSPaul Mullowney PetscFunctionReturn(0); 615ca45077fSPaul Mullowney } 616ca45077fSPaul Mullowney 617ca45077fSPaul Mullowney #undef __FUNCT__ 6189ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 6199ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 6209ae82921SPaul Mullowney { 6219ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6229ae82921SPaul Mullowney PetscErrorCode ierr; 6239ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 624da80777bSKarl Rupp CUSPARRAY *xarray=PETSC_NULL,*yarray=PETSC_NULL,*zarray=PETSC_NULL; 6256e111a19SKarl Rupp 6269ae82921SPaul Mullowney PetscFunctionBegin; 627e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 628e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 6299ae82921SPaul Mullowney try { 6309ae82921SPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 6319ae82921SPaul 
Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 6329ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 6339ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 6349ae82921SPaul Mullowney 635e057df02SPaul Mullowney /* multiply add */ 6369ae82921SPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr); 6379ae82921SPaul Mullowney 6389ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 6399ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 6409ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 6419ae82921SPaul Mullowney 6429ae82921SPaul Mullowney } catch(char* ex) { 6439ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6449ae82921SPaul Mullowney } 6459ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 6469ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 6479ae82921SPaul Mullowney PetscFunctionReturn(0); 6489ae82921SPaul Mullowney } 6499ae82921SPaul Mullowney 6509ae82921SPaul Mullowney #undef __FUNCT__ 651ca45077fSPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 652ca45077fSPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 653ca45077fSPaul Mullowney { 654ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 655ca45077fSPaul Mullowney PetscErrorCode ierr; 656ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 657ca45077fSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 6586e111a19SKarl Rupp 659ca45077fSPaul Mullowney PetscFunctionBegin; 660e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 661e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 662ca45077fSPaul Mullowney try { 663ca45077fSPaul Mullowney ierr = 
VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 664ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 665ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 666ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 667ca45077fSPaul Mullowney 668e057df02SPaul Mullowney /* multiply add with matrix transpose */ 669ca45077fSPaul Mullowney #if !defined(PETSC_USE_COMPLEX) 670ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 671ca45077fSPaul Mullowney #else 672ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 673ca45077fSPaul Mullowney #endif 674ca45077fSPaul Mullowney 675ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 676ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 677ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 678ca45077fSPaul Mullowney 679ca45077fSPaul Mullowney } catch(char* ex) { 680ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 681ca45077fSPaul Mullowney } 682ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 683ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 684ca45077fSPaul Mullowney PetscFunctionReturn(0); 685ca45077fSPaul Mullowney } 686ca45077fSPaul Mullowney 687ca45077fSPaul Mullowney #undef __FUNCT__ 6889ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 6899ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 6909ae82921SPaul Mullowney { 6919ae82921SPaul Mullowney PetscErrorCode ierr; 6926e111a19SKarl Rupp 6939ae82921SPaul Mullowney PetscFunctionBegin; 6949ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 695e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 6969ae82921SPaul Mullowney 
if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 697bbf3fe20SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 698bbf3fe20SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 699bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 700bbf3fe20SPaul Mullowney A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 7019ae82921SPaul Mullowney PetscFunctionReturn(0); 7029ae82921SPaul Mullowney } 7039ae82921SPaul Mullowney 7049ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 7059ae82921SPaul Mullowney #undef __FUNCT__ 7069ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 707e057df02SPaul Mullowney /*@ 7089ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 709e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 710e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 711e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 712e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 713e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 7149ae82921SPaul Mullowney 7159ae82921SPaul Mullowney Collective on MPI_Comm 7169ae82921SPaul Mullowney 7179ae82921SPaul Mullowney Input Parameters: 7189ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 7199ae82921SPaul Mullowney . m - number of rows 7209ae82921SPaul Mullowney . n - number of columns 7219ae82921SPaul Mullowney . 
nz - number of nonzeros per row (same for all rows) 7229ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 7239ae82921SPaul Mullowney (possibly different for each row) or PETSC_NULL 7249ae82921SPaul Mullowney 7259ae82921SPaul Mullowney Output Parameter: 7269ae82921SPaul Mullowney . A - the matrix 7279ae82921SPaul Mullowney 7289ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 7299ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 7309ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 7319ae82921SPaul Mullowney 7329ae82921SPaul Mullowney Notes: 7339ae82921SPaul Mullowney If nnz is given then nz is ignored 7349ae82921SPaul Mullowney 7359ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 7369ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 7379ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 7389ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 7399ae82921SPaul Mullowney 7409ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 7419ae82921SPaul Mullowney Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 7429ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 7439ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 7449ae82921SPaul Mullowney 7459ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 7469ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. 
We 7479ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 7489ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 7499ae82921SPaul Mullowney 7509ae82921SPaul Mullowney Level: intermediate 7519ae82921SPaul Mullowney 752e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 7539ae82921SPaul Mullowney @*/ 7549ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 7559ae82921SPaul Mullowney { 7569ae82921SPaul Mullowney PetscErrorCode ierr; 7579ae82921SPaul Mullowney 7589ae82921SPaul Mullowney PetscFunctionBegin; 7599ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 7609ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 7619ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 7629ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 7639ae82921SPaul Mullowney PetscFunctionReturn(0); 7649ae82921SPaul Mullowney } 7659ae82921SPaul Mullowney 7669ae82921SPaul Mullowney #undef __FUNCT__ 7679ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 7689ae82921SPaul Mullowney PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 7699ae82921SPaul Mullowney { 7709ae82921SPaul Mullowney PetscErrorCode ierr; 7719ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 7729ae82921SPaul Mullowney 7739ae82921SPaul Mullowney PetscFunctionBegin; 7749ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 7759ae82921SPaul Mullowney try { 7769ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 7779ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 7789ae82921SPaul Mullowney } 7799ae82921SPaul Mullowney if 
(cusparseMat->tempvec!=0) 7809ae82921SPaul Mullowney delete cusparseMat->tempvec; 7819ae82921SPaul Mullowney delete cusparseMat; 7829ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 7839ae82921SPaul Mullowney } catch(char* ex) { 7849ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 7859ae82921SPaul Mullowney } 7869ae82921SPaul Mullowney } else { 787e057df02SPaul Mullowney /* The triangular factors */ 7889ae82921SPaul Mullowney try { 7899ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 7909ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 7919ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 7929ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatLo; 7939ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatUp; 7949ae82921SPaul Mullowney delete (CUSPARRAY*) cusparseTriFactors->tempvec; 7959ae82921SPaul Mullowney delete cusparseTriFactors; 7969ae82921SPaul Mullowney } catch(char* ex) { 7979ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 7989ae82921SPaul Mullowney } 7999ae82921SPaul Mullowney } 8009ae82921SPaul Mullowney if (MAT_cusparseHandle) { 8019ae82921SPaul Mullowney cusparseStatus_t stat; 8029ae82921SPaul Mullowney stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 8039ae82921SPaul Mullowney MAT_cusparseHandle=0; 8049ae82921SPaul Mullowney } 8059ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 8069ae82921SPaul Mullowney A->spptr = 0; 8079ae82921SPaul Mullowney 8089ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 8099ae82921SPaul Mullowney PetscFunctionReturn(0); 
8109ae82921SPaul Mullowney } 8119ae82921SPaul Mullowney 8129ae82921SPaul Mullowney EXTERN_C_BEGIN 8139ae82921SPaul Mullowney #undef __FUNCT__ 8149ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 8159ae82921SPaul Mullowney PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 8169ae82921SPaul Mullowney { 8179ae82921SPaul Mullowney PetscErrorCode ierr; 8189ae82921SPaul Mullowney 8199ae82921SPaul Mullowney PetscFunctionBegin; 8209ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 8219ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 822e057df02SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created. 823e057df02SPaul Mullowney now build a GPU matrix data structure */ 8249ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 8259ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0; 8269ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0; 827e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = MAT_CUSPARSE_CSR; 8289ae82921SPaul Mullowney } else { 8299ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 830debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 8319ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0; 8329ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0; 8339ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0; 834e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = cusparseMatSolveStorageFormat; 8359ae82921SPaul Mullowney } 836e057df02SPaul Mullowney /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) 
*/ 8379ae82921SPaul Mullowney if (!MAT_cusparseHandle) { 8389ae82921SPaul Mullowney cusparseStatus_t stat; 8399ae82921SPaul Mullowney stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 8409ae82921SPaul Mullowney } 841e057df02SPaul Mullowney /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 842e057df02SPaul Mullowney default cusparse tri solve. Note the difference with the implementation in 843e057df02SPaul Mullowney MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 8449ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 8459ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 8469ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 8479ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 8489ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 849ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 850ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 851ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 852ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 8539ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 8549ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 855e057df02SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 8569ae82921SPaul Mullowney PetscFunctionReturn(0); 8579ae82921SPaul Mullowney } 8589ae82921SPaul Mullowney EXTERN_C_END 8599ae82921SPaul Mullowney 860e057df02SPaul Mullowney /*M 861e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for 
sparse matrices. 862e057df02SPaul Mullowney 863e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 864e057df02SPaul Mullowney CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 865e057df02SPaul Mullowney the CUSPARSE library. This type is only available when using the 'txpetscgpu' package. 866e057df02SPaul Mullowney Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and 867e057df02SPaul Mullowney the different GPU storage formats. 868e057df02SPaul Mullowney 869e057df02SPaul Mullowney Options Database Keys: 870e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 871*8468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 872*8468deeeSKarl Rupp . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 873*8468deeeSKarl Rupp - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package. 874e057df02SPaul Mullowney 875e057df02SPaul Mullowney Level: beginner 876e057df02SPaul Mullowney 877*8468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 878e057df02SPaul Mullowney M*/ 879