xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bc3f50f23de72ffb0a81d3be97c02ce36d7be6cb)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3*bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
99ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
109ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
119ae82921SPaul Mullowney #undef VecType
129ae82921SPaul Mullowney #include "cusparsematimpl.h"
13*bc3f50f2SPaul Mullowney 
14e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
159ae82921SPaul Mullowney 
16e057df02SPaul Mullowney /* this is such a hack ... but I don't know of another way to pass this variable
17e057df02SPaul Mullowney    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18e057df02SPaul Mullowney    SpMV. Essentially, I need to use the same stream variable in two different
19e057df02SPaul Mullowney    data structures. I do this by creating a single instance of that stream
20e057df02SPaul Mullowney    and reuse it. */
21ca45077fSPaul Mullowney cudaStream_t theBodyStream=0;
229ae82921SPaul Mullowney 
23087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
24087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
25087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26087f3262SPaul Mullowney 
279ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
289ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
299ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
30087f3262SPaul Mullowney 
319ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
329ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
33bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
359ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
368dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
378dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
388dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
398dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
409ae82921SPaul Mullowney 
419ae82921SPaul Mullowney #undef __FUNCT__
429ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices created by MatGetFactor_seqaij_cusparse():
   reports which solver package produced the factorization. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  /* This file implements factorizations via the CUSPARSE solver package */
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
499ae82921SPaul Mullowney 
50c708e6cdSJed Brown /*MC
51087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
52087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
53087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
54087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
55087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
56087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
57c708e6cdSJed Brown 
58c708e6cdSJed Brown   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
59c708e6cdSJed Brown 
60087f3262SPaul Mullowney   Consult CUSPARSE documentation for more information about the matrix storage formats
61087f3262SPaul Mullowney   which correspond to the options database keys below.
62c708e6cdSJed Brown 
63c708e6cdSJed Brown    Options Database Keys:
64087f3262SPaul Mullowney .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
659ae82921SPaul Mullowney 
669ae82921SPaul Mullowney   Level: beginner
67c708e6cdSJed Brown 
68c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
69c708e6cdSJed Brown M*/
709ae82921SPaul Mullowney 
719ae82921SPaul Mullowney #undef __FUNCT__
729ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Creates an (unallocated) SEQAIJCUSPARSE factor matrix for A and installs the
   symbolic-factorization entry points matching the requested factor type.
   LU/ILU/ILUDT and Cholesky/ICC are supported; anything else is an error. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       m = A->rmap->n;   /* factor is square: rows == cols of A's row map */

  PetscFunctionBegin;
  /* Build the shell of the factor matrix; actual storage is allocated during
     symbolic factorization, hence MAT_SKIP_ALLOCATION */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,m,m,m,m);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);

  /* Route the symbolic stage according to the factorization requested */
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* Expose the solver-package query and record the factor type on the result */
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
979ae82921SPaul Mullowney 
989ae82921SPaul Mullowney #undef __FUNCT__
99e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the GPU
   storage format for SpMV (on this matrix's Mat_SeqAIJCUSPARSE struct), for
   triangular solves (file/global cusparseMatSolveStorageFormat), or both. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* affects only the SpMV storage of this matrix */
    cusparsestruct->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* affects the storage used for factors in MatSolve */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusparsestruct->format        = format;
    cusparseMatSolveStorageFormat = format;
  } else {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
1219ae82921SPaul Mullowney 
122e057df02SPaul Mullowney /*@
123e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
124e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
1258468deeeSKarl Rupp    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
126e057df02SPaul Mullowney    to build/install PETSc to use this package.
127e057df02SPaul Mullowney 
128e057df02SPaul Mullowney    Not Collective
129e057df02SPaul Mullowney 
130e057df02SPaul Mullowney    Input Parameters:
1318468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
1328468deeeSKarl Rupp .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1338468deeeSKarl Rupp -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
134e057df02SPaul Mullowney 
135e057df02SPaul Mullowney    Output Parameter:
136e057df02SPaul Mullowney 
137e057df02SPaul Mullowney    Level: intermediate
138e057df02SPaul Mullowney 
1398468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
140e057df02SPaul Mullowney @*/
141e057df02SPaul Mullowney #undef __FUNCT__
142e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat"
/* Public entry point (manual page above): dispatches to the type-specific
   MatCUSPARSESetFormat_C method if the matrix type provides one; a no-op
   for types that do not (PetscTryMethod does not error on a missing method). */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
152e057df02SPaul Mullowney 
1539ae82921SPaul Mullowney #undef __FUNCT__
1549ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options-database keys for a SEQAIJCUSPARSE
   matrix. Unfactored matrices consult the SpMV format key, factored ones the
   TriSolve format key; the combined -mat_cusparse_storage_format key is
   consulted in both cases and overrides via MAT_CUSPARSE_ALL. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead is called before PetscObjectOptionsBegin here,
     which is the reverse of the usual PETSc ordering -- confirm intended */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: only the SpMV storage format is relevant */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: only the triangular-solve storage format is relevant */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella key: sets both SpMV and TriSolve formats when given */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
1869ae82921SPaul Mullowney 
1879ae82921SPaul Mullowney #undef __FUNCT__
1889ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU(k): defer to the host (SeqAIJ) implementation for the nonzero
   pattern, then redirect the numeric stage to the CUSPARSE version so the
   triangular factors are built on the GPU. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* numeric phase is GPU-aware */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1989ae82921SPaul Mullowney 
1999ae82921SPaul Mullowney #undef __FUNCT__
2009ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic full LU: defer to the host (SeqAIJ) implementation for the nonzero
   pattern, then redirect the numeric stage to the CUSPARSE version. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* numeric phase is GPU-aware */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2109ae82921SPaul Mullowney 
2119ae82921SPaul Mullowney #undef __FUNCT__
212087f3262SPaul Mullowney #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC(k): defer to the host (SeqAIJ) implementation for the nonzero
   pattern, then redirect the numeric stage to the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* numeric phase is GPU-aware */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222087f3262SPaul Mullowney 
223087f3262SPaul Mullowney #undef __FUNCT__
224087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic full Cholesky: defer to the host (SeqAIJ) implementation for the
   nonzero pattern, then redirect the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* numeric phase is GPU-aware */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234087f3262SPaul Mullowney 
235087f3262SPaul Mullowney #undef __FUNCT__
236087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Builds the lower-triangular factor L on the GPU from the host-side ILU
   factor held in A (Mat_SeqAIJ): assembles an explicit CSR copy of L in pinned
   host memory -- with unit diagonal entries inserted, since CUSPARSE_DIAG_TYPE_UNIT
   is declared but the 1's are stored explicitly here -- uploads it via the
   GPU_Matrix_Ifc wrapper, and runs the CUSPARSE solve analysis.
   No-op unless the GPU copy is missing or stale (PETSC_CUSP_UNALLOCATED/CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strictly-lower entries of rows 1..n-1 (row 0 of L
         is just its unit diagonal); +n adds one diagonal entry per row.
         NOTE(review): reads ai[1], so this assumes n >= 1 -- confirm callers
         never reach here with an empty matrix */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers so the later upload can use fast DMA */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;       /* next free slot in AjLo/AALo */
      rowOffset= 1;       /* running CSR row pointer (row 0 consumed 1 slot) */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the format selected for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* publish the factor; NOTE(review): any previous loTriFactorPtr is overwritten
         without being freed here -- presumably destroyed elsewhere, confirm */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU has the data */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3059ae82921SPaul Mullowney 
3069ae82921SPaul Mullowney #undef __FUNCT__
307087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Builds the upper-triangular factor U on the GPU from the host-side ILU
   factor in A. PETSc stores U rows backwards via a->diag (adiag[i] points at
   row i's diagonal, rows ordered n-1..0), so the CSR copy is assembled by a
   reverse sweep; diagonal entries are stored inverted (1/d) to match what the
   stored factor holds. No-op unless the GPU copy is missing or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases as rows advance, hence adiag[0]-adiag[n] is the total count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers so the later upload can use fast DMA */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;   /* fill AjUp/AAUp back-to-front, matching the reverse row sweep */
      for (i=n-1; i>=0; i--) {
        /* off-diagonal data of row i starts just past the diagonal slot of row i+1 */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the stored factor keeps 1/d at v[nz]; it is copied as-is */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the format selected for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      /* publish the factor; NOTE(review): any previous upTriFactorPtr is overwritten
         without being freed here -- presumably destroyed elsewhere, confirm */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU has the data */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3729ae82921SPaul Mullowney 
3739ae82921SPaul Mullowney #undef __FUNCT__
374087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Drives the GPU setup for an ILU/LU factored matrix: builds both triangular
   factors on the device, allocates the scratch vector used between the L and U
   solves, and attaches the row/column permutation indices (skipped when the
   orderings are identities). Marks the GPU copy as current. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector for the intermediate result of the two-stage triangular solve;
     NOTE(review): assigned unconditionally -- if this routine can run twice on the
     same matrix the previous CUSPARRAY leaks; confirm against the destroy path */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* non-trivial row permutation must be applied on the GPU during MatSolve */
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    /* non-trivial column permutation must be applied on the GPU during MatSolve */
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4109ae82921SPaul Mullowney 
4119ae82921SPaul Mullowney #undef __FUNCT__
412087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
413087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
414087f3262SPaul Mullowney {
415087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
416087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
417087f3262SPaul Mullowney   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
418087f3262SPaul Mullowney   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
419087f3262SPaul Mullowney   cusparseStatus_t             stat;
420087f3262SPaul Mullowney   PetscErrorCode               ierr;
421087f3262SPaul Mullowney   PetscInt                     *AiUp, *AjUp;
422087f3262SPaul Mullowney   PetscScalar                  *AAUp;
423087f3262SPaul Mullowney   PetscScalar                  *AALo;
424087f3262SPaul Mullowney   PetscInt                     nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
425087f3262SPaul Mullowney   Mat_SeqSBAIJ                 *b = (Mat_SeqSBAIJ*)A->data;
426087f3262SPaul Mullowney   const PetscInt               *ai = b->i,*aj = b->j,*vj;
427087f3262SPaul Mullowney   const MatScalar              *aa = b->a,*v;
428087f3262SPaul Mullowney 
429087f3262SPaul Mullowney   PetscFunctionBegin;
430087f3262SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
431087f3262SPaul Mullowney     try {
432087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
433087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
434087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
435087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
436087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
437087f3262SPaul Mullowney 
438087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
439087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
440087f3262SPaul Mullowney       AiUp[n]=nzUpper;
441087f3262SPaul Mullowney       offset = 0;
442087f3262SPaul Mullowney       for (i=0; i<n; i++) {
443087f3262SPaul Mullowney         /* set the pointers */
444087f3262SPaul Mullowney         v  = aa + ai[i];
445087f3262SPaul Mullowney         vj = aj + ai[i];
446087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
447087f3262SPaul Mullowney 
448087f3262SPaul Mullowney         /* first, set the diagonal elements */
449087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
450087f3262SPaul Mullowney         AAUp[offset] = 1.0/v[nz];
451087f3262SPaul Mullowney         AiUp[i]      = offset;
452087f3262SPaul Mullowney         AALo[offset] = 1.0/v[nz];
453087f3262SPaul Mullowney 
454087f3262SPaul Mullowney         offset+=1;
455087f3262SPaul Mullowney         if (nz>0) {
456087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
457087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
458087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
459087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
460087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
461087f3262SPaul Mullowney           }
462087f3262SPaul Mullowney           offset+=nz;
463087f3262SPaul Mullowney         }
464087f3262SPaul Mullowney       }
465087f3262SPaul Mullowney 
466087f3262SPaul Mullowney       /* Build the upper triangular piece */
467087f3262SPaul Mullowney       cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
468087f3262SPaul Mullowney       stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
469087f3262SPaul Mullowney       ierr = cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
470087f3262SPaul Mullowney       stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
471087f3262SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;
472087f3262SPaul Mullowney 
473087f3262SPaul Mullowney       /* Build the lower triangular piece */
474087f3262SPaul Mullowney       cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
475087f3262SPaul Mullowney       stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
476087f3262SPaul Mullowney       ierr = cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
477087f3262SPaul Mullowney       stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
478087f3262SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;
479087f3262SPaul Mullowney 
480087f3262SPaul Mullowney       /* set this flag ... for performance logging */
481087f3262SPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;
482087f3262SPaul Mullowney 
483087f3262SPaul Mullowney       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
484087f3262SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
485087f3262SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
486087f3262SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
487087f3262SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
488087f3262SPaul Mullowney     } catch(char *ex) {
489087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
490087f3262SPaul Mullowney     }
491087f3262SPaul Mullowney   }
492087f3262SPaul Mullowney   PetscFunctionReturn(0);
493087f3262SPaul Mullowney }
494087f3262SPaul Mullowney 
495087f3262SPaul Mullowney #undef __FUNCT__
496087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  const PetscInt               *permIndices;
  PetscBool                    identity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  /* Assemble the ICC triangular factors on the GPU. */
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* Work vector that carries the intermediate result between the two triangular solves. */
  factors->tempvec = new CUSPARRAY;
  factors->tempvec->resize(nrows);

  /* Hand the row permutation to both factors, but only when it is not the identity. */
  ierr = ISGetIndices(rowperm,&permIndices);CHKERRQ(ierr);
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (!identity) {
    ierr = factors->loTriFactorPtr->setOrdIndices(permIndices, nrows);CHKERRCUSP(ierr);
    ierr = factors->upTriFactorPtr->setOrdIndices(permIndices, nrows);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(rowperm,&permIndices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
521087f3262SPaul Mullowney 
522087f3262SPaul Mullowney #undef __FUNCT__
5239ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIsIdentity,colIsIdentity;

  PetscFunctionBegin;
  /* Run the host-side numeric LU factorization first. */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Pick the solve flavour: the natural-ordering kernels skip the permutation step. */
  ierr = ISIdentity(baij->row,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(baij->col,&colIsIdentity);CHKERRQ(ierr);
  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* Copy the triangular factors to the GPU and run the solve analysis. */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5489ae82921SPaul Mullowney 
549087f3262SPaul Mullowney #undef __FUNCT__
550087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;

  PetscFunctionBegin;
  /* Run the host-side numeric Cholesky/ICC factorization first. */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Pick the solve flavour: the natural-ordering kernels skip the permutation step. */
  ierr = ISIdentity(baij->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* Copy the triangular factors to the GPU and run the solve analysis. */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5759ae82921SPaul Mullowney 
576bda325fcSPaul Mullowney #undef __FUNCT__
577bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The transpose of the unit lower factor is unit upper triangular. */
  stat = loFactor->initializeCusparseMatTranspose(MAT_cusparseHandle,
                                                  CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                                  CUSPARSE_FILL_MODE_UPPER,
                                                  CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
  stat = loFactor->solveAnalysisTranspose();CHKERRCUSP(stat);

  /* The transpose of the non-unit upper factor is non-unit lower triangular. */
  stat = upFactor->initializeCusparseMatTranspose(MAT_cusparseHandle,
                                                  CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                                  CUSPARSE_FILL_MODE_LOWER,
                                                  CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  stat = upFactor->solveAnalysisTranspose();CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
599bda325fcSPaul Mullowney 
600bda325fcSPaul Mullowney #undef __FUNCT__
601bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Describe the transpose with the matrix type chosen at assembly time:
     symmetric/Hermitian matrices were stored upper-triangular, general ones were not. */
  stat = gpuMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle,
           gpuMat->isSymmOrHerm ? CUSPARSE_MATRIX_TYPE_SYMMETRIC : CUSPARSE_MATRIX_TYPE_GENERAL,
           gpuMat->isSymmOrHerm ? CUSPARSE_FILL_MODE_UPPER       : CUSPARSE_FILL_MODE_LOWER,
           CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
  /* Upload the CSR data for the transpose multiply. */
  ierr = gpuMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, aij->nz, aij->i, aij->j, aij->a);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
618bda325fcSPaul Mullowney 
619bda325fcSPaul Mullowney #undef __FUNCT__
620bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *solGPU,*rhsGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Lazily run the transpose solve analysis the first time we are called. */
  if (!factors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    factors->hasTranspose = PETSC_TRUE;
  }

  /* Access the GPU arrays of the solution and right-hand-side vectors. */
  ierr = VecCUSPGetArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&rhsGPU);CHKERRQ(ierr);

  /* Permute in, solve with U^T then L^T, permute back out. */
  ierr = upFactor->reorderIn(solGPU, rhsGPU);CHKERRCUSP(ierr);
  stat = upFactor->solveTranspose(solGPU, workGPU);CHKERRCUSP(stat);
  stat = loFactor->solveTranspose(workGPU, solGPU);CHKERRCUSP(stat);
  ierr = loFactor->reorderOut(solGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&rhsGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* Symmetric/Hermitian storage represents twice the work per stored nonzero. */
  if (factors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
661bda325fcSPaul Mullowney 
662bda325fcSPaul Mullowney #undef __FUNCT__
663bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *solGPU,*rhsGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Lazily run the transpose solve analysis the first time we are called. */
  if (!factors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    factors->hasTranspose = PETSC_TRUE;
  }

  /* Access the GPU arrays of the solution and right-hand-side vectors. */
  ierr = VecCUSPGetArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&rhsGPU);CHKERRQ(ierr);

  /* Natural ordering: no permutation needed, solve U^T then L^T directly. */
  stat = upFactor->solveTranspose(rhsGPU, workGPU);CHKERRCUSP(stat);
  stat = loFactor->solveTranspose(workGPU, solGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&rhsGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* Symmetric/Hermitian storage represents twice the work per stored nonzero. */
  if (factors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
701bda325fcSPaul Mullowney 
7029ae82921SPaul Mullowney #undef __FUNCT__
7039ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *solGPU,*rhsGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Access the GPU arrays of the solution and right-hand-side vectors. */
  ierr = VecCUSPGetArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&rhsGPU);CHKERRQ(ierr);

  /* Permute in, solve with L then U, permute back out. */
  ierr = loFactor->reorderIn(solGPU, rhsGPU);CHKERRCUSP(ierr);
  stat = loFactor->solve(solGPU, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, solGPU);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(solGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&rhsGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* Symmetric/Hermitian storage represents twice the work per stored nonzero. */
  if (factors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7369ae82921SPaul Mullowney 
7379ae82921SPaul Mullowney #undef __FUNCT__
7389ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *solGPU,*rhsGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Access the GPU arrays of the solution and right-hand-side vectors. */
  ierr = VecCUSPGetArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&rhsGPU);CHKERRQ(ierr);

  /* Natural ordering: no permutation needed, solve L then U directly. */
  stat = loFactor->solve(rhsGPU, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, solGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&rhsGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&solGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* Symmetric/Hermitian storage represents twice the work per stored nonzero. */
  if (factors->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*A->cmap->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7699ae82921SPaul Mullowney 
7709ae82921SPaul Mullowney #undef __FUNCT__
771e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copies the host AIJ matrix to the GPU (rebuilding the cuSPARSE structure from
   scratch) whenever the GPU copy is unallocated or stale.  Chooses the cuSPARSE
   matrix type (GENERAL/SYMMETRIC/HERMITIAN) from the matrix's symmetry state and
   always uses a compressed-row index set on the device. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           m = A->rmap->n,*ii,*ridx;
  PetscBool          symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE;
  PetscBool          symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* Count the rows that actually contain nonzeros. */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* The host already keeps a compressed-row structure; borrow its arrays
           (ii/ridx are NOT owned by this function in this branch). */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage.
           Build temporary ii (row offsets over nonempty rows) and ridx (original
           row index of each nonempty row); freed below in this branch only. */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type (CSR/ELL/HYB). */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired). theBodyStream is shared
         across GPU matrix objects for the parallel SpMV (see file header). */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* Select the cuSPARSE matrix type from the symmetry information.
         NOTE(review): the else branch is only reached when MatIsSymmetricKnown
         already reported the matrix symmetric, so the MatIsSymmetric/MatIsHermitian
         re-tests below look redundant — confirm whether that is intentional. */
      ierr = MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest);CHKERRQ(ierr);
      if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) {
        /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
        cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
        cusparseMat->isSymmOrHerm = PETSC_FALSE;
      } else {
        ierr = MatIsSymmetric(A,0.0,&symmetryTest);CHKERRQ(ierr);
        if (symmetryTest) {
          cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
          cusparseMat->isSymmOrHerm = PETSC_TRUE;
        } else {
          ierr = MatIsHermitian(A,0.0,&hermitianTest);CHKERRQ(ierr);
          if (hermitianTest) {
            cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
            cusparseMat->isSymmOrHerm = PETSC_TRUE;
          } else {
            /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
            cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
            cusparseMat->isSymmOrHerm = PETSC_FALSE;
          }
        }
      }

      /* lastly, build the matrix: upload CSR data and the compressed-row indices. */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* Free the temporaries only when we allocated them above. */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* Work vector sized to the number of (compressed) rows. */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
8769ae82921SPaul Mullowney 
8779ae82921SPaul Mullowney #undef __FUNCT__
8789ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Creates CUSP vectors compatible with the matrix: 'right' is sized/laid out
   like the column space (input of MatMult), 'left' like the row space (output).
   Either pointer may be NULL to skip that vector. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* Bug fix: the right vector lives in the column space, so it must carry the
       column block size; the original used mat->rmap->bs, which is wrong for
       rectangular matrices with differing row/column block sizes. */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
9009ae82921SPaul Mullowney 
9019ae82921SPaul Mullowney #undef __FUNCT__
9029ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarr,*yarr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No host->device copy here: MatAssemblyEnd_SeqAIJCUSPARSE has already done it. */
  ierr = VecCUSPGetArrayRead(xx,&xarr);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarr);CHKERRQ(ierr);
  try {
    /* y = A*x on the GPU. */
    ierr = gpuMat->mat->multiply(xarr, yarr);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarr);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarr);CHKERRQ(ierr);
  /* With a non-default stream in use the caller synchronizes; otherwise wait here. */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* Symmetric/Hermitian storage represents twice the work per stored nonzero. */
  if (gpuMat->isSymmOrHerm) {
    ierr = PetscLogFlops(4.0*aij->nz - 3.0*gpuMat->nonzerorow);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
9329ae82921SPaul Mullowney 
9339ae82921SPaul Mullowney #undef __FUNCT__
934ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
935ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
936ca45077fSPaul Mullowney {
937ca45077fSPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
938ca45077fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
939bda325fcSPaul Mullowney   CUSPARRAY          *xarray,*yarray;
940b175d8bbSPaul Mullowney   PetscErrorCode     ierr;
941ca45077fSPaul Mullowney 
942ca45077fSPaul Mullowney   PetscFunctionBegin;
943e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
944e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
945bda325fcSPaul Mullowney   if (!cusparseMat->hasTranspose) {
946bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
947bda325fcSPaul Mullowney     cusparseMat->hasTranspose=PETSC_TRUE;
948bda325fcSPaul Mullowney   }
949ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
950ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
951ca45077fSPaul Mullowney   try {
952087f3262SPaul Mullowney     ierr = cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr);
953ca45077fSPaul Mullowney   } catch (char *ex) {
954ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
955ca45077fSPaul Mullowney   }
956ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
957ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
958ca45077fSPaul Mullowney   if (!cusparseMat->mat->hasNonZeroStream()) {
959ca45077fSPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
960ca45077fSPaul Mullowney   }
961087f3262SPaul Mullowney   if (cusparseMat->isSymmOrHerm) {
962087f3262SPaul Mullowney     ierr = PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
963087f3262SPaul Mullowney   } else {
964ca45077fSPaul Mullowney     ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
965087f3262SPaul Mullowney   }
966ca45077fSPaul Mullowney   PetscFunctionReturn(0);
967ca45077fSPaul Mullowney }
968ca45077fSPaul Mullowney 
969ca45077fSPaul Mullowney #undef __FUNCT__
9709ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
9719ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
9729ae82921SPaul Mullowney {
9739ae82921SPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
9749ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
975bda325fcSPaul Mullowney   CUSPARRAY          *xarray,*yarray,*zarray;
976b175d8bbSPaul Mullowney   PetscErrorCode     ierr;
9776e111a19SKarl Rupp 
9789ae82921SPaul Mullowney   PetscFunctionBegin;
979e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
980e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
9819ae82921SPaul Mullowney   try {
9829ae82921SPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
9839ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
9849ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
9859ae82921SPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
9869ae82921SPaul Mullowney 
987e057df02SPaul Mullowney     /* multiply add */
9889ae82921SPaul Mullowney     ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);
9899ae82921SPaul Mullowney 
9909ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
9919ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
9929ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
9939ae82921SPaul Mullowney 
9949ae82921SPaul Mullowney   } catch(char *ex) {
9959ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
9969ae82921SPaul Mullowney   }
9979ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
998087f3262SPaul Mullowney   if (cusparseMat->isSymmOrHerm) {
999087f3262SPaul Mullowney     ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
1000087f3262SPaul Mullowney   } else {
10019ae82921SPaul Mullowney     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1002087f3262SPaul Mullowney   }
10039ae82921SPaul Mullowney   PetscFunctionReturn(0);
10049ae82921SPaul Mullowney }
10059ae82921SPaul Mullowney 
10069ae82921SPaul Mullowney #undef __FUNCT__
1007b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
1008ca45077fSPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1009ca45077fSPaul Mullowney {
1010ca45077fSPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1011ca45077fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
1012ca45077fSPaul Mullowney   CUSPARRAY          *xarray,*yarray,*zarray;
1013b175d8bbSPaul Mullowney   PetscErrorCode     ierr;
10146e111a19SKarl Rupp 
1015ca45077fSPaul Mullowney   PetscFunctionBegin;
1016e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1017e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1018bda325fcSPaul Mullowney   if (!cusparseMat->hasTranspose) {
1019bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1020bda325fcSPaul Mullowney     cusparseMat->hasTranspose=PETSC_TRUE;
1021bda325fcSPaul Mullowney   }
1022ca45077fSPaul Mullowney   try {
1023ca45077fSPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1024ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1025ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1026ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1027ca45077fSPaul Mullowney 
1028e057df02SPaul Mullowney     /* multiply add with matrix transpose */
1029087f3262SPaul Mullowney     ierr = cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr);
1030ca45077fSPaul Mullowney 
1031ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1032ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1033ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1034ca45077fSPaul Mullowney 
1035ca45077fSPaul Mullowney   } catch(char *ex) {
1036ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1037ca45077fSPaul Mullowney   }
1038ca45077fSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1039087f3262SPaul Mullowney   if (cusparseMat->isSymmOrHerm) {
1040087f3262SPaul Mullowney     ierr = PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);CHKERRQ(ierr);
1041087f3262SPaul Mullowney   } else {
1042ca45077fSPaul Mullowney     ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1043087f3262SPaul Mullowney   }
1044ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1045ca45077fSPaul Mullowney }
1046ca45077fSPaul Mullowney 
1047ca45077fSPaul Mullowney #undef __FUNCT__
10489ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
10499ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
10509ae82921SPaul Mullowney {
10519ae82921SPaul Mullowney   PetscErrorCode ierr;
10526e111a19SKarl Rupp 
10539ae82921SPaul Mullowney   PetscFunctionBegin;
10549ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1055*bc3f50f2SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
1056e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1057*bc3f50f2SPaul Mullowney   }
10589ae82921SPaul Mullowney   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1059bbf3fe20SPaul Mullowney   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1060bbf3fe20SPaul Mullowney   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1061bbf3fe20SPaul Mullowney   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1062bbf3fe20SPaul Mullowney   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
10639ae82921SPaul Mullowney   PetscFunctionReturn(0);
10649ae82921SPaul Mullowney }
10659ae82921SPaul Mullowney 
10669ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
10679ae82921SPaul Mullowney #undef __FUNCT__
10689ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1069e057df02SPaul Mullowney /*@
10709ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1072e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1073e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1074e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1075e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
10769ae82921SPaul Mullowney 
10779ae82921SPaul Mullowney    Collective on MPI_Comm
10789ae82921SPaul Mullowney 
10799ae82921SPaul Mullowney    Input Parameters:
10809ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
10819ae82921SPaul Mullowney .  m - number of rows
10829ae82921SPaul Mullowney .  n - number of columns
10839ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
10849ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
10850298fd71SBarry Smith          (possibly different for each row) or NULL
10869ae82921SPaul Mullowney 
10879ae82921SPaul Mullowney    Output Parameter:
10889ae82921SPaul Mullowney .  A - the matrix
10899ae82921SPaul Mullowney 
10909ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
10929ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
10939ae82921SPaul Mullowney 
10949ae82921SPaul Mullowney    Notes:
10959ae82921SPaul Mullowney    If nnz is given then nz is ignored
10969ae82921SPaul Mullowney 
10979ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
10989ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
10999ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
11009ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
11019ae82921SPaul Mullowney 
11029ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
11030298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11049ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
11059ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
11069ae82921SPaul Mullowney 
11079ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
11089ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
11099ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
11109ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
11119ae82921SPaul Mullowney 
11129ae82921SPaul Mullowney    Level: intermediate
11139ae82921SPaul Mullowney 
1114e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
11159ae82921SPaul Mullowney @*/
11169ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
11179ae82921SPaul Mullowney {
11189ae82921SPaul Mullowney   PetscErrorCode ierr;
11199ae82921SPaul Mullowney 
11209ae82921SPaul Mullowney   PetscFunctionBegin;
11219ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
11229ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
11239ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
11249ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
11259ae82921SPaul Mullowney   PetscFunctionReturn(0);
11269ae82921SPaul Mullowney }
11279ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Frees the GPU-side data attached to A->spptr (either the CUSPARSE matrix for
   an unfactored matrix, or the triangular factors for a factored one), then
   falls through to the standard SeqAIJ destroy. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: spptr holds a Mat_SeqAIJCUSPARSE */
    try {
      /* the GPU matrix only exists once something was copied to the device */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a single handle shared by all CUSPARSE
     matrices (created in MatCreate_SeqAIJCUSPARSE), yet it is destroyed here on
     the destruction of ANY one matrix.  If other CUSPARSE matrices are still
     alive this looks unsafe — confirm, and consider reference-counting the
     handle instead. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
11739ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor for MATSEQAIJCUSPARSE: builds the SeqAIJ base type, attaches
   the GPU-side data structure appropriate for the factorization state, ensures
   the shared cusparse handle exists, and installs the CUSPARSE method table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm   = PETSC_FALSE;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  /* NOTE(review): the handle is created once here but unconditionally destroyed
     in MatDestroy_SeqAIJCUSPARSE — see the note there; confirm lifetime. */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* route assembly, destruction, vector creation, options, and all multiplies
     through the CUSPARSE implementations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the device yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
12279ae82921SPaul Mullowney 
1228e057df02SPaul Mullowney /*M
1229e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1230e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1232e057df02SPaul Mullowney    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1233e057df02SPaul Mullowney    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use a different CUSPARSE library and
1235e057df02SPaul Mullowney    the different GPU storage formats.
1236e057df02SPaul Mullowney 
1237e057df02SPaul Mullowney    Options Database Keys:
1238e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
12398468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
12408468deeeSKarl Rupp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
12418468deeeSKarl Rupp -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1242e057df02SPaul Mullowney 
1243e057df02SPaul Mullowney   Level: beginner
1244e057df02SPaul Mullowney 
12458468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1246e057df02SPaul Mullowney M*/
1247