xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 087f32624287cccaebf63b858eabc4ad5aaff02a)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9*087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
10*087f3262SPaul Mullowney 
119ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
129ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
139ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
149ae82921SPaul Mullowney #undef VecType
159ae82921SPaul Mullowney #include "cusparsematimpl.h"
/* String table for the MatCUSPARSEStorageFormat enum, consumed by
   PetscOptionsEnum(): the first three entries name the values (CSR, ELL, HYB),
   followed by the enum type name, the option prefix, and the required
   terminating NULL. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
/* NOTE(review): a bare global with no synchronization — assumes single-threaded
   host access to this module; confirm before using from multiple host threads. */
cudaStream_t theBodyStream=0;
249ae82921SPaul Mullowney 
25*087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
26*087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
27*087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
28*087f3262SPaul Mullowney 
299ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
309ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
319ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
32*087f3262SPaul Mullowney 
339ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
349ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
35bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
36bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
379ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
388dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
398dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
408dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
418dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
429ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   MatFactorGetSolverPackage_seqaij_cusparse - Reports the solver package
   ("cusparse") that produced a factored matrix.  Composed onto factor
   matrices in MatGetFactor_seqaij_cusparse() and queried via
   MatFactorGetSolverPackage().

   Input Parameter:
.  A - the factored matrix (unused here)

   Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
519ae82921SPaul Mullowney 
5274c3e148SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
53b2573a8aSBarry Smith 
54*087f3262SPaul Mullowney 
55c708e6cdSJed Brown /*MC
56*087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
57*087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
58*087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
59*087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
60*087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
61*087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
62c708e6cdSJed Brown 
63c708e6cdSJed Brown   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
64c708e6cdSJed Brown 
65*087f3262SPaul Mullowney   Consult CUSPARSE documentation for more information about the matrix storage formats
66*087f3262SPaul Mullowney   which correspond to the options database keys below.
67c708e6cdSJed Brown 
68c708e6cdSJed Brown    Options Database Keys:
69*087f3262SPaul Mullowney .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
709ae82921SPaul Mullowney 
719ae82921SPaul Mullowney   Level: beginner
72c708e6cdSJed Brown 
73c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
74c708e6cdSJed Brown M*/
759ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   MatGetFactor_seqaij_cusparse - Creates a factor matrix for the "cusparse"
   solver package: the factor is first obtained from the native PETSc AIJ
   factorization, converted to MATSEQAIJCUSPARSE, and then the symbolic-phase
   function pointers are redirected so the numeric phase and the triangular
   solves run through the CUSPARSE implementations.

   Input Parameters:
+  A     - the matrix to be factored (type seqaij/seqaijcusparse)
-  ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

   Output Parameter:
.  B - the (unfactored) factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build on the host AIJ factor object, then switch its type to the GPU class */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
979ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices.  Records the requested
   GPU storage format on the SpMV matrix (MAT_CUSPARSE_MULT), on the global
   default used for triangular-solve factors (MAT_CUSPARSE_SOLVE), or on both
   (MAT_CUSPARSE_ALL).
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
1219ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the matrix type composed
     one; PetscTryMethod() is a silent no-op for types that did not */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
152e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the options database keys that
   pick the GPU storage format for SEQAIJCUSPARSE matrices:

     -mat_cusparse_mult_storage_format  csr|ell|hyb - SpMV format (non-factored matrices only)
     -mat_cusparse_solve_storage_format csr|ell|hyb - TriSolve/factor format (factored matrices only)
     -mat_cusparse_storage_format       csr|ell|hyb - both of the above

   Bug fix: PetscOptionsHead() was previously called BEFORE
   PetscObjectOptionsBegin(); the head must appear inside the options
   begin/end block, and it is now paired with a matching PetscOptionsTail().
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* non-factored matrices are only ever multiplied */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrices are only ever used for triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* catch-all key that sets both formats at once */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1869ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: reuse the host (SeqAIJ) symbolic phase unchanged, then point
   the numeric phase at the CUSPARSE implementation so the factors end up on
   the GPU. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1989ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: reuse the host (SeqAIJ) symbolic phase unchanged, then point
   the numeric phase at the CUSPARSE implementation so the factors end up on
   the GPU. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2109ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: reuse the host (SeqAIJ) symbolic phase unchanged, then point
   the numeric phase at the CUSPARSE Cholesky implementation so the factors
   end up on the GPU. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222*087f3262SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: reuse the host (SeqAIJ) symbolic phase unchanged, then
   point the numeric phase at the CUSPARSE implementation so the factors end
   up on the GPU. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234*087f3262SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Builds the GPU copy of the lower triangular factor L (unit diagonal) from
   the host-factored SeqAIJ matrix.  The host factor stores each row's
   strict-lower entries contiguously (a->i/a->j/a->a) with the unit diagonal
   implicit, so an explicit CSR copy with the 1.0 diagonal appended per row is
   assembled in page-locked host memory, handed to a cusparse matrix of the
   configured storage format, and the triangular-solve analysis is run.
   No-op when the GPU already holds a current copy. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;    /* CSR row pointers / column indices of L */
  PetscScalar                  *AALo;           /* CSR values of L */
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 contributes only its unit diagonal; rows 1..n-1 contribute ai[i+1]-ai[i] strict-lower
         entries plus a diagonal each: n + (ai[n]-ai[1]) in total */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers so the later device upload is fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 is just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;        /* next free slot in AjLo/AALo */
      rowOffset= 1;        /* running CSR row pointer */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strict-lower part of row i, then append the explicit unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the storage-format-specific GPU matrix (CSR/ELL/HYB) */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU has its copy */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3059ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Builds the GPU copy of the upper triangular factor U from the
   host-factored SeqAIJ matrix.  The host factor addresses U rows through
   a->diag; the rows are walked from last to first, filling the CSR arrays
   back-to-front so the result is in ascending row order.  The assembled CSR
   lives in page-locked host memory, is uploaded into a cusparse matrix of the
   configured storage format, and the triangular-solve analysis is run.
   No-op when the GPU already holds a current copy. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;    /* CSR row pointers / column indices of U */
  PetscScalar                  *AAUp;           /* CSR values of U */
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers so the later device upload is fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* adiag[i+1]+1 points at the first off-diagonal entry of U's row i in the host factor */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* NOTE(review): PETSc's LU numeric phase appears to store the RECIPROCAL of the
           diagonal in the factor (v[nz]); 1./v[nz] recovers the actual diagonal value
           that cusparse's non-unit-diagonal solve needs — confirm against MatLUFactorNumeric_SeqAIJ */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the storage-format-specific GPU matrix (CSR/ELL/HYB) */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU has its copy */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3729ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Copies both ILU/LU triangular factors to the GPU, allocates the GPU work
   vector used between the L and U solves, marks the GPU copy current, and —
   when the row/column permutations are not the identity — records the
   ordering index arrays on the factors so MatSolve can permute on the GPU. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the L and U solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  /* only store the permutation when it is not the identity; the natural-ordering
     solve path skips the permutation entirely */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4109ae82921SPaul Mullowney 
4119ae82921SPaul Mullowney #undef __FUNCT__
412*087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
413*087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
414*087f3262SPaul Mullowney {
415*087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
416*087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
417*087f3262SPaul Mullowney   GPU_Matrix_Ifc               * cusparseMatLo       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
418*087f3262SPaul Mullowney   GPU_Matrix_Ifc               * cusparseMatUp       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
419*087f3262SPaul Mullowney   cusparseStatus_t             stat;
420*087f3262SPaul Mullowney   PetscErrorCode               ierr;
421*087f3262SPaul Mullowney   PetscInt                     *AiUp, *AjUp;
422*087f3262SPaul Mullowney   PetscScalar                  *AAUp;
423*087f3262SPaul Mullowney   PetscScalar                  *AALo;
424*087f3262SPaul Mullowney   PetscInt          nzUpper=a->nz,n=A->rmap->n,i,offset,nz,j;
425*087f3262SPaul Mullowney   Mat_SeqSBAIJ      *b = (Mat_SeqSBAIJ*)A->data;
426*087f3262SPaul Mullowney   const PetscInt    *ai=b->i,*aj=b->j,*vj;
427*087f3262SPaul Mullowney   const MatScalar   *aa=b->a,*v;
428*087f3262SPaul Mullowney 
429*087f3262SPaul Mullowney   PetscFunctionBegin;
430*087f3262SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
431*087f3262SPaul Mullowney     try {
432*087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
433*087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
434*087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
435*087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
436*087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
437*087f3262SPaul Mullowney 
438*087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
439*087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
440*087f3262SPaul Mullowney       AiUp[n]=nzUpper;
441*087f3262SPaul Mullowney       offset = 0;
442*087f3262SPaul Mullowney       for (i=0; i<n; i++) {
443*087f3262SPaul Mullowney         /* set the pointers */
444*087f3262SPaul Mullowney         v  = aa + ai[i];
445*087f3262SPaul Mullowney         vj = aj + ai[i];
446*087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
447*087f3262SPaul Mullowney 
448*087f3262SPaul Mullowney         /* first, set the diagonal elements */
449*087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
450*087f3262SPaul Mullowney         AAUp[offset] = 1.0/v[nz];
451*087f3262SPaul Mullowney         AiUp[i]      = offset;
452*087f3262SPaul Mullowney         AALo[offset] = 1.0/v[nz];
453*087f3262SPaul Mullowney 
454*087f3262SPaul Mullowney         offset+=1;
455*087f3262SPaul Mullowney         if (nz>0) {
456*087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
457*087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
458*087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
459*087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
460*087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
461*087f3262SPaul Mullowney           }
462*087f3262SPaul Mullowney           offset+=nz;
463*087f3262SPaul Mullowney         }
464*087f3262SPaul Mullowney       }
465*087f3262SPaul Mullowney 
466*087f3262SPaul Mullowney       /* Build the upper triangular piece */
467*087f3262SPaul Mullowney       cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
468*087f3262SPaul Mullowney       stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
469*087f3262SPaul Mullowney       ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
470*087f3262SPaul Mullowney       stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
471*087f3262SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;
472*087f3262SPaul Mullowney 
473*087f3262SPaul Mullowney       /* Build the lower triangular piece */
474*087f3262SPaul Mullowney       cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
475*087f3262SPaul Mullowney       stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
476*087f3262SPaul Mullowney       ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
477*087f3262SPaul Mullowney       stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
478*087f3262SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;
479*087f3262SPaul Mullowney 
480*087f3262SPaul Mullowney       /* set this flag ... for performance logging */
481*087f3262SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;
482*087f3262SPaul Mullowney 
483*087f3262SPaul Mullowney       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
484*087f3262SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
485*087f3262SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
486*087f3262SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
487*087f3262SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
488*087f3262SPaul Mullowney     } catch(char *ex) {
489*087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
490*087f3262SPaul Mullowney     }
491*087f3262SPaul Mullowney   }
492*087f3262SPaul Mullowney   PetscFunctionReturn(0);
493*087f3262SPaul Mullowney }
494*087f3262SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Upload the ICC triangular factors of A to the GPU, allocate the work
   vector used between the two triangular solves, and (for a non-trivial
   row permutation) hand the permutation indices to both factors so the
   solves can reorder on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip                  = a->row;
  const PetscInt               *indices;
  PetscBool                    identity;
  PetscInt                     n                   = A->rmap->n;

  PetscFunctionBegin;
  /* build and copy the triangular pieces to the GPU */
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the two solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  /* only a non-identity permutation needs index information on the GPU */
  ierr = ISIdentity(ip,&identity);CHKERRQ(ierr);
  ierr = ISGetIndices(ip,&indices);CHKERRQ(ierr);
  if (!identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(indices, n);CHKERRCUSP(ierr);
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(indices, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(ip,&indices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521*087f3262SPaul Mullowney 
522*087f3262SPaul Mullowney #undef __FUNCT__
5239ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
5249ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
5259ae82921SPaul Mullowney {
5269ae82921SPaul Mullowney   PetscErrorCode ierr;
5279ae82921SPaul Mullowney   Mat_SeqAIJ     *b    =(Mat_SeqAIJ*)B->data;
5289ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
5299ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
5309ae82921SPaul Mullowney 
5319ae82921SPaul Mullowney   PetscFunctionBegin;
5329ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
533e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
5349ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5359ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
536bda325fcSPaul Mullowney   if (row_identity && col_identity) {
537bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
538bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
539bda325fcSPaul Mullowney   } else {
540bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
541bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
542bda325fcSPaul Mullowney   }
5438dc1d2a3SPaul Mullowney 
544e057df02SPaul Mullowney   /* get the triangular factors */
545*087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
5469ae82921SPaul Mullowney   PetscFunctionReturn(0);
5479ae82921SPaul Mullowney }
5489ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: CPU factorization followed by selection of
   the GPU solve variant and upload of the triangular factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip  = b->row;
  PetscBool      perm_identity;

  PetscFunctionBegin;
  /* the numeric factorization itself is performed on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* choose the MatSolve variant: natural ordering avoids device-side
     reordering of the right-hand side and solution */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* copy the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5759ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Run the cusparse solve analysis for the TRANSPOSES of the two triangular
   factors so MatSolveTranspose can execute on the GPU.  The fill modes are
   flipped relative to the forward solve: the transpose of the lower factor is
   described as UPPER triangular (unit diagonal) and the transpose of the
   upper factor as LOWER triangular (non-unit diagonal).
   NOTE(review): this UNIT/NON_UNIT split matches the ILU factor storage; the
   ICC path above stores both pieces in the upper pattern -- confirm this
   routine is only invoked with factors whose layout matches. */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* transpose of the lower factor: upper triangular, unit diagonal */
  stat = cusparseMatLo->initializeCusparseMatTranspose(MAT_cusparseHandle,
                                                       CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                                       CUSPARSE_FILL_MODE_UPPER,
                                                       CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysisTranspose();CHKERRCUSP(stat);

  /* transpose of the upper factor: lower triangular, non-unit diagonal */
  stat = cusparseMatUp->initializeCusparseMatTranspose(MAT_cusparseHandle,
                                                       CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                                       CUSPARSE_FILL_MODE_LOWER,
                                                       CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysisTranspose();CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
598bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Build the transpose of the GPU matrix so MatMultTranspose can run on the
   device.  The cusparse descriptor mirrors the one used for the forward
   multiply: SYMMETRIC/UPPER when the matrix was detected symmetric or
   hermitian, GENERAL otherwise. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode       ierr;
  Mat_SeqAIJCUSPARSE   *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ           *a           = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t     stat;
  cusparseMatrixType_t mtype;
  cusparseFillMode_t   fmode;

  PetscFunctionBegin;
  /* fill mode is only meaningful for the symmetric/hermitian case; for
     GENERAL matrices cusparse ignores it */
  mtype = cusparseMat->isSymmOrHerm ? CUSPARSE_MATRIX_TYPE_SYMMETRIC : CUSPARSE_MATRIX_TYPE_GENERAL;
  fmode = cusparseMat->isSymmOrHerm ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER;
  stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, mtype, fmode, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  ierr = cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
616bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* GPU triangular solve with A^T for a factored matrix with non-trivial
   permutations: reorder the right-hand side in, solve with U^T then L^T,
   and reorder the solution back out. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray, *barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;

  PetscFunctionBegin;
  /* lazily run the transpose solve analysis on first use */
  if (!triFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    triFactors->hasTranspose = PETSC_TRUE;
  }

  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* permute in, solve U^T then L^T, permute back out */
  ierr = upFactor->reorderIn(xarray, barray);CHKERRCUSP(ierr);
  stat = upFactor->solveTranspose(xarray, workGPU);CHKERRCUSP(stat);
  stat = loFactor->solveTranspose(workGPU, xarray);CHKERRCUSP(stat);
  ierr = loFactor->reorderOut(xarray);CHKERRCUSP(ierr);

  /* restore the vector device arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* flop count depends on whether the matrix was flagged symmetric/hermitian */
  if (triFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
659bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU triangular solve with A^T for a factored matrix in natural ordering:
   no reordering of the right-hand side or solution is needed. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray, *barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;

  PetscFunctionBegin;
  /* lazily run the transpose solve analysis on first use */
  if (!triFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    triFactors->hasTranspose = PETSC_TRUE;
  }

  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve U^T then L^T */
  stat = upFactor->solveTranspose(barray, workGPU);CHKERRCUSP(stat);
  stat = loFactor->solveTranspose(workGPU, xarray);CHKERRCUSP(stat);

  /* restore the vector device arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* flop count depends on whether the matrix was flagged symmetric/hermitian */
  if (triFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
699bda325fcSPaul Mullowney 
7009ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* GPU triangular solve with A for a factored matrix with non-trivial
   permutations: reorder the right-hand side in, solve with L then U,
   and reorder the solution back out. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray, *barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;

  PetscFunctionBegin;
  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* permute in, solve L then U, permute back out */
  ierr = loFactor->reorderIn(xarray, barray);CHKERRCUSP(ierr);
  stat = loFactor->solve(xarray, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xarray);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xarray);CHKERRCUSP(ierr);

  /* restore the vector device arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* flop count depends on whether the matrix was flagged symmetric/hermitian */
  if (triFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7359ae82921SPaul Mullowney 
7369ae82921SPaul Mullowney 
7379ae82921SPaul Mullowney 
7389ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU triangular solve with A for a factored matrix in natural ordering:
   no reordering of the right-hand side or solution is needed. */
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xarray, *barray;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor   = (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor   = (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU    = (CUSPARRAY*)triFactors->tempvec;

  PetscFunctionBegin;
  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve L then U */
  stat = loFactor->solve(barray, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xarray);CHKERRCUSP(stat);

  /* restore the vector device arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* flop count depends on whether the matrix was flagged symmetric/hermitian */
  if (triFactors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7719ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy the host AIJ matrix to the GPU if the device copy is missing or stale.
   Rebuilds the device matrix from scratch (a compressed-row view is forced
   for CSR storage), probes the matrix for symmetry/hermitian structure to
   pick the cusparse matrix type, and allocates the multiply work vector.
   Fix: the error codes returned by MatIsSymmetricKnown/MatIsSymmetric/
   MatIsHermitian were previously assigned to ierr but never checked; they
   are now guarded with CHKERRQ like every other call in this file. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;
  PetscBool          symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE;
  PetscBool          symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that actually contain nonzeros */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* choose the cusparse matrix type; probe symmetry/hermitian structure
         only when the user has flagged the matrix symmetric */
      ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest);CHKERRQ(ierr);
      if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) {
        /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
        cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
        cusparseMat->isSymmOrHerm = PETSC_FALSE;
      } else {
        ierr = MatIsSymmetric(A,0.0,&symmetryTest);CHKERRQ(ierr);
        if (symmetryTest) {
          cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
          cusparseMat->isSymmOrHerm = PETSC_TRUE;
        } else {
          ierr = MatIsHermitian(A,0.0,&hermitianTest);CHKERRQ(ierr);
          if (hermitianTest) {
            cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
            cusparseMat->isSymmOrHerm = PETSC_TRUE;
          } else {
            /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
            cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
            cusparseMat->isSymmOrHerm = PETSC_FALSE;
          }
        }
      }

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* ii/ridx were allocated above only in this branch */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
8789ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create CUSP vectors compatible with the matrix: right is sized by the
   column layout (input of MatMult), left by the row layout (output).
   Fix: the right vector previously took its block size from rmap; since it
   lives in the column space it must use cmap's block size (identical for
   square matrices, wrong for rectangular ones). */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
9029ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x on the GPU.  Assumes the device copy of A is current (the upload
   happens in MatAssemblyEnd_SeqAIJCUSPARSE). */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;

  PetscFunctionBegin;
  /* The copy-to-GPU call is intentionally absent here: it has been moved to
     MatAssemblyEnd_SeqAIJCUSPARSE */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusp->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when no non-default stream is in play */
  if (!cusp->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* flop count depends on whether the matrix was flagged symmetric/hermitian */
  if (cusp->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 3.0*cusp->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz - cusp->nonzerorow);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
9349ae82921SPaul Mullowney 
9359ae82921SPaul Mullowney 
9369ae82921SPaul Mullowney #undef __FUNCT__
937ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
938ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
939ca45077fSPaul Mullowney {
940ca45077fSPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
941ca45077fSPaul Mullowney   PetscErrorCode     ierr;
942ca45077fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
943bda325fcSPaul Mullowney   CUSPARRAY          *xarray,*yarray;
944ca45077fSPaul Mullowney 
945ca45077fSPaul Mullowney   PetscFunctionBegin;
946e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
947e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
948bda325fcSPaul Mullowney   if (!cusparseMat->hasTranspose) {
949bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
950bda325fcSPaul Mullowney     cusparseMat->hasTranspose=PETSC_TRUE;
951bda325fcSPaul Mullowney   }
952ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
953ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
954ca45077fSPaul Mullowney   try {
955*087f3262SPaul Mullowney     ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr);
956ca45077fSPaul Mullowney   } catch (char *ex) {
957ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
958ca45077fSPaul Mullowney   }
959ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
960ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
961ca45077fSPaul Mullowney   if (!cusparseMat->mat->hasNonZeroStream()) {
962ca45077fSPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
963ca45077fSPaul Mullowney   }
964*087f3262SPaul Mullowney   if (cusparseMat->isSymmOrHerm) {
965*087f3262SPaul Mullowney     ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
966*087f3262SPaul Mullowney   } else {
967ca45077fSPaul Mullowney     ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
968*087f3262SPaul Mullowney   }
969ca45077fSPaul Mullowney   PetscFunctionReturn(0);
970ca45077fSPaul Mullowney }
971ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* Computes zz = A*xx + yy on the GPU. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the copy already happened in
     MatAssemblyEnd_SeqAIJCUSPARSE */
  try {
    /* seed zz with yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    ierr = cusp->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(cusp->isSymmOrHerm ? (4.0*aij->nz - 2.0*cusp->nonzerorow) : (2.0*aij->nz));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10089ae82921SPaul Mullowney 
#undef __FUNCT__
/* Fixed: __FUNCT__ previously said "MatMultAdd_SeqAIJCUSPARSE" (copy-paste),
   which mislabeled error tracebacks from this routine */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* Computes zz = A^T*xx + yy on the GPU, building the transpose data lazily. */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* the transpose representation is generated on first use only */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    /* seed zz with yy, then accumulate the transpose product into zz */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose. Fixed: accumulate into zarray (the
       output), not yarray — yarray is the read-only input and the non-transpose
       MatMultAdd_SeqAIJCUSPARSE accumulates into zarray the same way */
    ierr = cusparseMat->mat->multiplyAddTranspose(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (cusparseMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1049ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finishes host-side assembly, mirrors the matrix to the GPU, and on final
   assembly installs the GPU mat-vec kernels. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly only: point the operations table at the CUSPARSE kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
10669ae82921SPaul Mullowney 
10679ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
10689ae82921SPaul Mullowney #undef __FUNCT__
10699ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1070e057df02SPaul Mullowney /*@
10719ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1073e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1074e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1075e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1076e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
10779ae82921SPaul Mullowney 
10789ae82921SPaul Mullowney    Collective on MPI_Comm
10799ae82921SPaul Mullowney 
10809ae82921SPaul Mullowney    Input Parameters:
10819ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
10829ae82921SPaul Mullowney .  m - number of rows
10839ae82921SPaul Mullowney .  n - number of columns
10849ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
10859ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
10860298fd71SBarry Smith          (possibly different for each row) or NULL
10879ae82921SPaul Mullowney 
10889ae82921SPaul Mullowney    Output Parameter:
10899ae82921SPaul Mullowney .  A - the matrix
10909ae82921SPaul Mullowney 
10919ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
10939ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
10949ae82921SPaul Mullowney 
10959ae82921SPaul Mullowney    Notes:
10969ae82921SPaul Mullowney    If nnz is given then nz is ignored
10979ae82921SPaul Mullowney 
10989ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
10999ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
11009ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
11019ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
11029ae82921SPaul Mullowney 
11039ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
11040298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11059ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
11069ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
11079ae82921SPaul Mullowney 
11089ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
11099ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
11109ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
11119ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
11129ae82921SPaul Mullowney 
11139ae82921SPaul Mullowney    Level: intermediate
11149ae82921SPaul Mullowney 
1115e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
11169ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* standard create/set-sizes/set-type sequence; MatSetType() installs the
     CUSPARSE function table (see MatCreate_SeqAIJCUSPARSE) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate via the SeqAIJ implementation directly; per the man page above,
     nnz (if non-NULL) takes precedence over nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
11289ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Frees the GPU-side data attached to A->spptr (either the mat-vec structure
   or the triangular factors, depending on A->factortype), then delegates to
   MatDestroy_SeqAIJ for the host-side data. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: spptr holds a Mat_SeqAIJCUSPARSE */
    try {
      /* mat is only allocated once data has been copied to the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a library handle shared by every
     aijcusparse matrix (created once in MatCreate_SeqAIJCUSPARSE), yet it is
     destroyed here when ANY single matrix is destroyed; other still-live
     matrices would then operate on a destroyed handle. Confirm the intended
     handle lifetime — a reference count may be needed. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
11749ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds on MatCreate_SeqAIJ, attaches the GPU-side data
   holder in A->spptr, and installs the CUSPARSE operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       Allocate the GPU matrix holder with all members zeroed/defaulted. */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat          = 0;
    cusp->tempvec      = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->hasTranspose = PETSC_FALSE;
    cusp->isSymmOrHerm = PETSC_FALSE;

    B->spptr = cusp;
  } else {
    /* factored matrix: spptr instead holds the (not yet computed) triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr = 0;
    tri->upTriFactorPtr = 0;
    tri->tempvec        = 0;
    tri->format         = cusparseMatSolveStorageFormat;
    tri->hasTranspose   = PETSC_FALSE;
    tri->isSymmOrHerm   = PETSC_FALSE;

    B->spptr = tri;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* install the CUSPARSE implementations in the operations table */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the device yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
12309ae82921SPaul Mullowney 
1231e057df02SPaul Mullowney /*M
1232e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1233e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1235e057df02SPaul Mullowney    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1236e057df02SPaul Mullowney    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1237e057df02SPaul Mullowney    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1238e057df02SPaul Mullowney    the different GPU storage formats.
1239e057df02SPaul Mullowney 
1240e057df02SPaul Mullowney    Options Database Keys:
1241e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
12428468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
12438468deeeSKarl Rupp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
12448468deeeSKarl Rupp -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1245e057df02SPaul Mullowney 
1246e057df02SPaul Mullowney   Level: beginner
1247e057df02SPaul Mullowney 
12488468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1249e057df02SPaul Mullowney M*/
1250