xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 8cc058d9cd56c1ccb3be12a47760ddfc446aaffc)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
15e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
169ae82921SPaul Mullowney 
17e057df02SPaul Mullowney /* this is such a hack ... but I don't know of another way to pass this variable
18e057df02SPaul Mullowney    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19e057df02SPaul Mullowney    SpMV. Essentially, I need to use the same stream variable in two different
20e057df02SPaul Mullowney    data structures. I do this by creating a single instance of that stream
21e057df02SPaul Mullowney    and reuse it. */
22ca45077fSPaul Mullowney cudaStream_t theBodyStream=0;
239ae82921SPaul Mullowney 
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
319ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
32e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
338dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
348dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
358dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
368dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
379ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query routine composed on CUSPARSE factor matrices: reports which solver
   package produced the factorization. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
469ae82921SPaul Mullowney 
479ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
48b2573a8aSBarry Smith 
49c708e6cdSJed Brown /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported algorithms are ILU(k) and LU. ICC(k) and
52c708e6cdSJed Brown   Cholesky will be supported in future versions. This class does NOT support direct solver operations.
53c708e6cdSJed Brown 
54c708e6cdSJed Brown   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
55c708e6cdSJed Brown 
56c708e6cdSJed Brown   Consult CUSPARSE documentation for more information about the matrix storage formats which correspond to the options
57c708e6cdSJed Brown   database keys below.
58c708e6cdSJed Brown 
59c708e6cdSJed Brown   Options Database Keys:
60c708e6cdSJed Brown .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
619ae82921SPaul Mullowney 
629ae82921SPaul Mullowney   Level: beginner
63c708e6cdSJed Brown 
64c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
65c708e6cdSJed Brown M*/
669ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates a SEQAIJCUSPARSE factor matrix for LU-type factorizations.

   Input Parameters:
+  A     - the matrix to be factored
-  ftype - the factor type; only MAT_FACTOR_LU, MAT_FACTOR_ILU and MAT_FACTOR_ILUDT are supported

   Output Parameter:
.  B - the factor matrix, with the CUSPARSE symbolic-factorization kernels installed
*/
PETSC_EXTERN PETSC_EXTERN_C PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Validate the factor type BEFORE creating the factor matrix: the previous
     code raised the unsupported-type error after MatGetFactor_seqaij_petsc()
     had already created *B, leaking that matrix on the error path. */
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

  /* route the symbolic phases to the CUSPARSE-aware implementations */
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;

  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
869ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for SpMV, triangular solves, or both. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* SpMV format lives on the per-matrix GPU structure */
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* TriSolve format is a file-scope setting picked up when factors are built */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
1109ae82921SPaul Mullowney 
111e057df02SPaul Mullowney 
112e057df02SPaul Mullowney /*@
113e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
114e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
1158468deeeSKarl Rupp    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
116e057df02SPaul Mullowney    to build/install PETSc to use this package.
117e057df02SPaul Mullowney 
118e057df02SPaul Mullowney    Not Collective
119e057df02SPaul Mullowney 
120e057df02SPaul Mullowney    Input Parameters:
1218468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
1228468deeeSKarl Rupp .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1238468deeeSKarl Rupp -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
124e057df02SPaul Mullowney 
125e057df02SPaul Mullowney    Output Parameter:
126e057df02SPaul Mullowney 
127e057df02SPaul Mullowney    Level: intermediate
128e057df02SPaul Mullowney 
1298468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
130e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the format setter composed on the matrix type; a no-op for
     types that did not compose "MatCUSPARSESetFormat_C". */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
142e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the options database keys selecting the GPU storage format:
     -mat_cusparse_mult_storage_format   (non-factored matrices, SpMV)
     -mat_cusparse_solve_storage_format  (factored matrices, TriSolve)
     -mat_cusparse_storage_format        (both operations)
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  /* PetscOptionsHead() belongs INSIDE the block opened by
     PetscObjectOptionsBegin(); the previous code called it beforehand. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* this key applies regardless of whether the matrix is factored */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1769ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: reuse the CPU AIJ symbolic phase, then route the numeric
   phase to the CUSPARSE implementation. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1899ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: reuse the CPU AIJ symbolic phase, then route the numeric
   phase to the CUSPARSE implementation. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2029ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Copies the lower triangular factor L (unit diagonal) of an LU/ILU
   factorization from the CPU factored storage in A->data to a GPU matrix
   and runs the cusparse triangular-solve analysis on it.

   The factored AIJ arrays a->i/a->j/a->a hold the strictly lower entries of
   each row (the unit diagonal is implicit), so the 1.0 diagonal entries are
   inserted explicitly while the CSR arrays are assembled below.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;   /* host-side CSR row pointers / column indices */
  PetscScalar                  *AALo;          /* host-side CSR values */
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is absent or stale */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly lower entries of rows 1..n-1
         (row 0 of L contributes only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory, so the upload in setMatrix() can be fast;
         NOTE(review): these buffers leak if a CHKERRCUSP/CHKERRQ failure or a
         thrown exception occurs before the cudaFreeHost calls -- confirm. */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;          /* row 0 holds only its unit diagonal */
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;                     /* write position in AjLo/AALo */
      rowOffset= 1;                     /* running row-pointer value */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry for the row */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the storage format selected for solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* NOTE(review): any previous loTriFactorPtr is overwritten without being
         freed here -- verify the caller destroys it first. */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
2739ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Copies the upper triangular factor U of an LU/ILU factorization from the
   CPU factored storage in A->data to a GPU matrix and runs the cusparse
   triangular-solve analysis on it.

   The rows of U are recovered through a->diag (adiag), walking the rows from
   last to first; the diagonal entry is reconstructed as 1./v[nz] since the
   factored storage apparently keeps the diagonal inverted (v[nz] addresses
   aa[adiag[i]] because nz = adiag[i]-adiag[i+1]-1) -- NOTE(review): confirm
   against the SeqAIJ factored-storage layout.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;   /* host-side CSR row pointers / column indices */
  PetscScalar                  *AAUp;          /* host-side CSR values */
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is absent or stale */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory; NOTE(review): leaks on any early error/exception
         before the cudaFreeHost calls below -- confirm intended. */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;                 /* write position, decremented per row */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;       /* first strictly-upper value of row i */
        vi = aj + adiag[i+1] + 1;       /* and its column index */

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];        /* v[nz] == aa[adiag[i]]; see header note */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly upper entries of the row */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the storage format selected for solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* NOTE(review): any previous upTriFactorPtr is overwritten without being
         freed here -- verify the caller destroys it first. */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3409ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Builds both triangular factors on the GPU, allocates the work vector used
   between the two triangular solves, and uploads the row/column orderings
   (skipped when the permutations are identities).
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* Allocate the inter-solve work vector only once: the previous code did
     "new CUSPARRAY" unconditionally, leaking the old vector on every repeated
     numeric factorization. resize() still tracks any change in n. */
  if (!cusparseTriFactors->tempvec) cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3789ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU: run the CPU numeric factorization, choose the solve
   kernels that match the ordering, then copy the factors to the GPU. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b    =(Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Natural ordering (identity row and column permutations) admits the
     faster solve variants that skip the reorder steps. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* build/upload the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4059ae82921SPaul Mullowney 
4069ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Runs the cusparse solve analysis needed for the TRANSPOSED triangular
   solves (MatSolveTranspose). The fill modes are deliberately swapped
   relative to the non-transposed case: the transpose of the lower factor is
   upper triangular (FILL_MODE_UPPER) and the transpose of the upper factor
   is lower triangular (FILL_MODE_LOWER).
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* L^T is upper triangular */
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  /* U^T is lower triangular */
  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
429bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Uploads A's CSR data to the GPU in transposed form so MatMultTranspose
   can run without transposing per product. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij            = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  stat = cusparsestruct->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparsestruct->mat->setMatrix(A->rmap->n, A->cmap->n, aij->nz, aij->i, aij->j, aij->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
443bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU using the transposed triangular factors.
   Since A = L*U (up to the row/column orderings), A^T = U^T * L^T, so the
   upper factor is applied FIRST and the lower factor second -- the reverse
   of the non-transposed solve order.
*/
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;  /* scratch between the two solves */

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  /* lazily run the transpose solve analysis the first time it is needed */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  /* NOTE(review): reorderIn/reorderOut presumably apply the ordering
     permutations stored by setOrdIndices -- confirm in the txpetscgpu package */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* two flops per stored nonzero across both solves, minus the n divisions
     folded into the stored inverse diagonal */
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
481bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Transposed triangular solve on the GPU for factors stored in natural
   ordering: no permutation of b or x is required. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Build the transpose solve data on the fly, once */
  if (!triFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    triFactors->hasTranspose = PETSC_TRUE;
  }

  /* Map the PETSc vectors onto their GPU arrays */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* Back-solve with U^T then L^T directly, no reordering needed */
  stat = upFactor->solve(bGPU, workGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = loFactor->solve(workGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);

  /* Hand the arrays back and account for the work */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
517bda325fcSPaul Mullowney 
5189ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Forward/backward triangular solve on the GPU using the cached L and U
   factors, with row reordering applied on the way in and out. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Map the PETSc vectors onto their GPU arrays */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* Permute b in, solve with L then U, and undo the permutation */
  ierr = loFactor->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = loFactor->solve(xGPU, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xGPU);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* Hand the arrays back and account for the work */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5499ae82921SPaul Mullowney 
5509ae82921SPaul Mullowney 
5519ae82921SPaul Mullowney 
5529ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Triangular solve on the GPU for factors stored in natural ordering:
   solve with L then U directly, with no permutation of b or x. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Map the PETSc vectors onto their GPU arrays */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* Solve L w = b into the work vector, then U x = w */
  stat = loFactor->solve(bGPU, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xGPU);CHKERRCUSP(stat);

  /* Hand the arrays back and account for the work */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5819ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host-side CSR matrix down to the GPU, (re)building the
   GPU_Matrix_Ifc object in the storage format selected in cusparseMat->format.
   A no-op when the GPU copy is already current (valid_GPU_matrix is neither
   UNALLOCATED nor CPU).  Called from MatAssemblyEnd_SeqAIJCUSPARSE.
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      /* discard the stale GPU copy (and its work vector) before rebuilding */
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that contain at least one nonzero */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host already keeps a compressed-row structure; reuse it directly */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        /* build temporary row pointers (ii) and original-row indices (ridx)
           covering only the nonempty rows */
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* free the scratch index arrays built above; presumably setMatrix and
         setCPRowIndices deep-copied them to the GPU --
         NOTE(review): confirm the GPU class does not retain these pointers */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector used by the triangular solves / multiplies */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6679ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors compatible with this matrix: "right"
   conforms to the column layout (an input of MatMult) and "left" to the row
   layout (an output of MatMult).  Either pointer may be NULL when that
   vector is not wanted.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector conforms to the columns, so it must take the column
       block size (the previous code incorrectly used the row block size,
       which differs for rectangular matrices with unequal block sizes) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the left vector conforms to the rows: row block size is correct here */
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6919ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x computed entirely on the GPU via the CUSPARSE matrix object. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuA->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only block the host when the multiply did not run on its own stream */
  if (!gpuA->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuA->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7199ae82921SPaul Mullowney 
7209ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T*x on the GPU; the transpose data is generated lazily on first use. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* Build the transpose data on the fly, once */
  if (!gpuA->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    gpuA->hasTranspose = PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuA->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only block the host when the multiply did not run on its own stream */
  if (!gpuA->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuA->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
752ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x: seed z with a copy of y, then accumulate the product on the GPU. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    /* accumulate A*x into z, which already holds the copy of y */
    ierr = gpuA->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7859ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   z = y + A^T*x on the GPU.  The transpose data is generated lazily on first
   use.  Fixes relative to the previous version: (1) __FUNCT__ wrongly named
   this routine "MatMultAdd_SeqAIJCUSPARSE", mislabeling error tracebacks;
   (2) the transpose product was accumulated into yy's array instead of zz's,
   which corrupted yy and left zz as a plain copy of yy.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* Build the transpose data on the fly, once */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: accumulate into zz (which holds the
       copy of yy), mirroring the non-transpose MatMultAdd above */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
822ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish host-side assembly, push the matrix to the GPU, and on a final
   (non-flush) assembly route the multiply kernels to the GPU versions. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: install the GPU multiply operations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
8399ae82921SPaul Mullowney 
8409ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9019ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Frees the GPU-side data attached to the matrix and then falls through to
   the standard SeqAIJ destroy.  Unfactored matrices carry a
   Mat_SeqAIJCUSPARSE in spptr; factored matrices carry the triangular-factor
   bundle instead.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* the GPU matrix object only exists once something was copied down */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): the library-wide cusparse handle is torn down as soon as ANY
     matrix is destroyed; if other CUSPARSE matrices are still alive they are
     left holding a destroyed handle -- confirm the intended handle lifetime */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9479ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Build the base sequential AIJ matrix first, then graft the GPU data onto it. */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* Unfactored matrix: attach the GPU matrix container. Note the inode.use flag
       cannot be consulted here since the matrix was just created. */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;

    cusparsestruct->mat          = 0;
    cusparsestruct->tempvec      = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->hasTranspose = PETSC_FALSE;
    B->spptr = cusparsestruct;
  } else {
    /* Factored matrix: attach the container that will hold the triangular factors. */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;

    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;
    cusparseTriFactors->tempvec        = 0;
    cusparseTriFactors->format         = cusparseMatSolveStorageFormat;
    cusparseTriFactors->hasTranspose   = PETSC_FALSE;
    B->spptr = cusparseTriFactors;
  }
  /* A single MAT_cusparseHandle instance is shared by every matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Overload MatGetFactor_petsc_C so that -mat_type aijcusparse picks up the default
     cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* Route the core matrix operations to their CUSPARSE implementations. */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* Nothing has been copied to the GPU yet. */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10019ae82921SPaul Mullowney 
1002e057df02SPaul Mullowney /*M
1003e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1004e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu when building/installing PETSc to enable the CUSPARSE library
   and the different GPU storage formats.
1010e057df02SPaul Mullowney 
1011e057df02SPaul Mullowney    Options Database Keys:
1012e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
10138468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
10148468deeeSKarl Rupp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1016e057df02SPaul Mullowney 
1017e057df02SPaul Mullowney   Level: beginner
1018e057df02SPaul Mullowney 
10198468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1020e057df02SPaul Mullowney M*/
1021