xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 00de8ff0695ff394d09a2c60082aeaab5870b6e2)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
/* String names for the GPU storage formats (CSR/ELL/HYB), consumed by the
   options database (PetscOptionsEnum) and by GPU_Matrix_Factory::getNew()
   below; the trailing entries are the standard PETSc enum-string bookkeeping
   (enum type name, option prefix, NULL terminator). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* this is such a hack ... but I don't know of another way to pass this variable
   from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
   SpMV. Essentially, I need to use the same stream variable in two different
   data structures. I do this by creating a single instance of that stream
   and reuse it. */
cudaStream_t theBodyStream=0;
239ae82921SPaul Mullowney 
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
319ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
32e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
338dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
348dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
358dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
368dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
379ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports the solver package backing a factored matrix as MATSOLVERCUSPARSE.
   Registered under "MatFactorGetSolverPackage_C" on the factor matrix in
   MatGetFactor_seqaij_cusparse() below. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
469ae82921SPaul Mullowney 
479ae82921SPaul Mullowney EXTERN_C_BEGIN
489ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
499ae82921SPaul Mullowney EXTERN_C_END
50c708e6cdSJed Brown /*MC
51c708e6cdSJed Brown   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices on a single GPU of type,
52c708e6cdSJed Brown   seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported algorithms are ILU(k) and LU. ICC(k) and
53c708e6cdSJed Brown   Cholesky will be supported in future versions. This class does NOT support direct solver operations.
54c708e6cdSJed Brown 
55c708e6cdSJed Brown   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE
56c708e6cdSJed Brown 
57c708e6cdSJed Brown   Consult CUSPARSE documentation for more information about the matrix storage formats which correspond to the options
58c708e6cdSJed Brown   database keys below.
59c708e6cdSJed Brown 
60c708e6cdSJed Brown   Options Database Keys:
61c708e6cdSJed Brown .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
629ae82921SPaul Mullowney 
639ae82921SPaul Mullowney   Level: beginner
64c708e6cdSJed Brown 
65c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
66c708e6cdSJed Brown M*/
679ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Factory routine for MATSOLVERCUSPARSE factor matrices.  Creates the factor
   matrix through the plain seqaij path, converts it to MATSEQAIJCUSPARSE, and
   hooks in the CUSPARSE symbolic-factorization entry points.  Only LU-type
   factorizations (LU, ILU, ILUDT) are supported; anything else errors out. */
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* let the CPU AIJ implementation create and size the factor matrix */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* old-style compose: a function-name string followed by the function
       pointer (this PETSc version's PetscObjectComposeFunction signature) */
    ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
899ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format for the SpMV matrix (stored on A->spptr),
   for the triangular-solve factors (a file-scope setting), or for both. */
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    gpuMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    gpuMat->format                = format;
    cusparseMatSolveStorageFormat = format;
  } else {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
1159ae82921SPaul Mullowney 
116e057df02SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific routine (MatCUSPARSESetFormat_SeqAIJCUSPARSE
     above); PetscTryMethod is a no-op for matrix types that do not compose
     "MatCUSPARSESetFormat_C", so this is safe to call on any Mat */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
147e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options database keys that select the GPU
   storage format.  Unfactored matrices expose the SpMV format option,
   factored matrices the triangular-solve format option, and both accept the
   umbrella -mat_cusparse_storage_format key that sets everything at once. */
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  /* PetscOptionsHead() annotates the options object set up by
     PetscObjectOptionsBegin(), so it must come after Begin (the original
     called it before, outside the Begin/End block) */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: the format only affects SpMV */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: the format only affects the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella key: sets the format for SpMV and TriSolve together */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1819ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU factorization: delegates to the CPU AIJ implementation, then
   redirects the numeric stage to the CUSPARSE version so the resulting
   triangular factors are copied to and solved on the GPU. */
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1949ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU factorization: delegates to the CPU AIJ implementation, then
   redirects the numeric stage to the CUSPARSE version (same pattern as the
   ILU variant above). */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2079ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/* Builds the GPU copy of the lower-triangular factor L with an explicit unit
   diagonal.  The host AIJ factor in A->data stores L and U interleaved per
   row; this routine extracts the strictly-lower entries of each row into
   temporary pinned-host CSR arrays (AiLo/AjLo/AALo), appends a 1.0 diagonal
   entry per row, uploads the result as a CUSPARSE triangular matrix, and runs
   the triangular-solve analysis phase.  A no-op unless the GPU copy is
   unallocated or stale relative to the CPU. */
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;          /* pinned host CSR row-offset / column arrays */
  PetscScalar                  *AALo;                 /* pinned host CSR value array */
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1
         (ai[n]-ai[1]); row 0 contributes only its diagonal — NOTE(review):
         relies on the factored-AIJ layout where row i's lower part starts at ai[i] */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory for the subsequent host->device copy */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      /* row 0 is just the unit diagonal */
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the format chosen for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      /* analysis phase required by CUSPARSE before any triangular solve */
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* the GPU helper library reports failures by throwing a char* message */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
2789ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/* Builds the GPU copy of the upper-triangular factor U.  The host AIJ factor
   stores U's rows via the a->diag pointers (with inverted diagonal values, as
   is the PETSc factored-AIJ convention — hence the 1./v[nz] below); this
   routine assembles U into pinned-host CSR arrays back-to-front, uploads it as
   a CUSPARSE triangular matrix, and runs the solve-analysis phase.  A no-op
   unless the GPU copy is unallocated or stale relative to the CPU. */
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;         /* pinned host CSR row-offset / column arrays */
  PetscScalar                  *AAUp;                /* pinned host CSR value array */
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards in the factored-AIJ layout, so this difference is
         the total upper-triangular count including diagonals */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host memory for the subsequent host->device copy */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix: walk rows from last to first,
         filling the CSR arrays from the back so offsets land correctly */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the host factor stores the inverted diagonal, so invert it back */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the format chosen for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      /* analysis phase required by CUSPARSE before any triangular solve */
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* the GPU helper library reports failures by throwing a char* message */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3459ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/* Uploads both triangular factors to the GPU, allocates the intermediate work
   vector used between the L and U solves, and pushes the row/column ordering
   index sets to the GPU factors when the orderings are not the identity.
   Called from MatLUFactorNumeric_SeqAIJCUSPARSE after each numeric
   factorization. */
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* allocate the solver work vector only once; the original re-allocated it
     unconditionally, leaking the previous vector on repeated numeric
     factorizations of the same matrix */
  if (!cusparseTriFactors->tempvec) {
    cusparseTriFactors->tempvec = new CUSPARRAY;
    cusparseTriFactors->tempvec->resize(n);
  }

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: only needed when the row ordering permutes */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: only needed when the column ordering permutes */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3839ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: the factorization itself runs on the CPU via the
   AIJ implementation; afterwards the MatSolve variants are selected based on
   whether the row/column orderings are the identity, and the factors are
   analyzed and copied to the GPU. */
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!row_identity || !col_identity) {
    /* at least one ordering permutes: use the reordering solves */
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4109ae82921SPaul Mullowney 
4119ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Prepares both GPU triangular factors for transpose solves by re-running the
   CUSPARSE initialization and analysis phases in TRANSPOSE mode.  Note the
   fill modes are deliberately swapped: L solved transposed acts as an
   upper-triangular matrix and U transposed as a lower-triangular one.
   Invoked lazily from MatSolveTranspose_SeqAIJCUSPARSE on first use. */
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* L^T is upper triangular */
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  /* U^T is lower triangular */
  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
434bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Uploads the host CSR data (a->i/a->j/a->a) to the GPU SpMV matrix in
   TRANSPOSE mode so transpose products can be performed.  NOTE(review): the
   fill-mode argument appears irrelevant for CUSPARSE_MATRIX_TYPE_GENERAL —
   confirm against the GPU_Matrix_Ifc implementation. */
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
448bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU with reordering: applies the ordering to b,
   runs the transposed upper- then lower-triangular solves (the reverse of the
   non-transposed L-then-U order), and undoes the ordering on x.  The
   transpose analysis data is generated lazily on first call. */
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;   /* only used for the flop count below */
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly (once per factorization) */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute b into x, solve U^T then L^T through the
     temp vector, then permute x back */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
486bda325fcSPaul Mullowney 
487bda325fcSPaul Mullowney #undef __FUNCT__
488bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Build the transpose solve analysis lazily, the first time it is needed */
  if (!factors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    factors->hasTranspose = PETSC_TRUE;
  }

  /* Map both vectors onto the GPU */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* Natural ordering: no permutation is needed, just the two transposed
     triangular solves (upper first, since the order reverses under transpose) */
  stat = upFactor->solve(barray, workGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = loFactor->solve(workGPU, xarray, TRANSPOSE);CHKERRCUSP(stat);

  /* Release the GPU arrays and synchronize before logging */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
522bda325fcSPaul Mullowney 
5239ae82921SPaul Mullowney 
5249ae82921SPaul Mullowney #undef __FUNCT__
5259ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Map both vectors onto the GPU */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* Permute the right-hand side in, forward- then back-solve with the
     triangular factors, and permute the solution back out */
  ierr = loFactor->reorderIn(xarray, barray);CHKERRCUSP(ierr);
  stat = loFactor->solve(xarray, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xarray);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xarray);CHKERRCUSP(ierr);

  /* Release the GPU arrays and synchronize before logging */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5549ae82921SPaul Mullowney 
5559ae82921SPaul Mullowney 
5569ae82921SPaul Mullowney 
5579ae82921SPaul Mullowney 
5589ae82921SPaul Mullowney #undef __FUNCT__
5599ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor = (GPU_Matrix_Ifc*)factors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor = (GPU_Matrix_Ifc*)factors->upTriFactorPtr;
  CUSPARRAY                    *workGPU  = (CUSPARRAY*)factors->tempvec;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Map both vectors onto the GPU */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* Natural ordering: no permutation is needed, just forward- and back-solve */
  stat = loFactor->solve(barray, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xarray);CHKERRCUSP(stat);

  /* Release the GPU arrays and synchronize before logging */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5869ae82921SPaul Mullowney 
5879ae82921SPaul Mullowney #undef __FUNCT__
588e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy (rebuild) the host AIJ matrix of A onto the GPU in the storage format
   selected in Mat_SeqAIJCUSPARSE->format.  No-op unless the GPU copy is
   unallocated or stale (PETSC_CUSP_UNALLOCATED / PETSC_CUSP_CPU); on success
   A->valid_GPU_matrix is set to PETSC_CUSP_BOTH. */
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and ensured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      /* discard any previous GPU copy before rebuilding */
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;

      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that actually contain nonzeros (used for flop logging
         and for building the compressed-row index below) */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host matrix already stores a compressed-row view; borrow it */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        /* build ii (row pointers over nonzero rows) and ridx (original row
           index of each nonzero row); freed after setMatrix below */
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      /* ii/ridx were allocated above only in the non-compressed-row branch */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the number of (nonzero) rows, used by SpMV paths */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6729ae82921SPaul Mullowney 
6739ae82921SPaul Mullowney #undef __FUNCT__
6749ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create CUSP vectors compatible with the matrix: the right vector lives in
   the column space (size cmap->n), the left vector in the row space (size
   rmap->n).  Either pointer may be NULL if that vector is not wanted. */
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* bug fix: the right vector is sized by the column layout, so it must take
       the column block size; the previous code used mat->rmap->bs, which is
       wrong for rectangular matrices with differing row/column block sizes */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6969ae82921SPaul Mullowney 
6979ae82921SPaul Mullowney #undef __FUNCT__
6989ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy is refreshed in
     MatAssemblyEnd_SeqAIJCUSPARSE */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    /* yy = A*xx on the device */
    ierr = gpuMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when the multiply is not using a non-default stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7249ae82921SPaul Mullowney 
7259ae82921SPaul Mullowney 
7269ae82921SPaul Mullowney #undef __FUNCT__
727ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy is refreshed in
     MatAssemblyEnd_SeqAIJCUSPARSE */
  /* build the transpose data lazily, the first time it is needed */
  if (!gpuMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    gpuMat->hasTranspose = PETSC_TRUE;
  }
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    /* yy = A^T*xx on the device */
    ierr = gpuMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when the multiply is not using a non-default stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
757ca45077fSPaul Mullowney 
758ca45077fSPaul Mullowney #undef __FUNCT__
7599ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the device copy is refreshed in
     MatAssemblyEnd_SeqAIJCUSPARSE */
  try {
    /* seed zz with yy, then accumulate the product into it: zz = yy + A*xx */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    ierr = gpuMat->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*aij->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7909ae82921SPaul Mullowney 
#undef __FUNCT__
/* bug fix: __FUNCT__ previously said "MatMultAdd_SeqAIJCUSPARSE", so error
   tracebacks reported the wrong function name */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* zz = yy + A^T*xx, computed on the GPU. */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the transpose data lazily, the first time it is needed */
  if (!cusparseMat->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    cusparseMat->hasTranspose=PETSC_TRUE;
  }
  try {
    /* zz starts as a copy of yy; the transposed product is accumulated into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose.
       bug fix: accumulate into zarray (the output zz) as MatMultAdd does; the
       previous code passed yarray, which modified the read-only input yy and
       left zz without the A^T*xx term */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
827ca45077fSPaul Mullowney 
828ca45077fSPaul Mullowney #undef __FUNCT__
8299ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* finish the host-side assembly, then mirror the matrix onto the GPU */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only a final assembly installs the GPU multiply kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
8449ae82921SPaul Mullowney 
8459ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
8469ae82921SPaul Mullowney #undef __FUNCT__
8479ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
848e057df02SPaul Mullowney /*@
8499ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
851e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
852e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
853e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
854e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
8559ae82921SPaul Mullowney 
8569ae82921SPaul Mullowney    Collective on MPI_Comm
8579ae82921SPaul Mullowney 
8589ae82921SPaul Mullowney    Input Parameters:
8599ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
8609ae82921SPaul Mullowney .  m - number of rows
8619ae82921SPaul Mullowney .  n - number of columns
8629ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
8639ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
8640298fd71SBarry Smith          (possibly different for each row) or NULL
8659ae82921SPaul Mullowney 
8669ae82921SPaul Mullowney    Output Parameter:
8679ae82921SPaul Mullowney .  A - the matrix
8689ae82921SPaul Mullowney 
8699ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
8719ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
8729ae82921SPaul Mullowney 
8739ae82921SPaul Mullowney    Notes:
8749ae82921SPaul Mullowney    If nnz is given then nz is ignored
8759ae82921SPaul Mullowney 
8769ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
8779ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
8789ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
8799ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
8809ae82921SPaul Mullowney 
8819ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
8820298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
8839ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
8849ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
8859ae82921SPaul Mullowney 
8869ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
8879ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
8889ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
8899ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
8909ae82921SPaul Mullowney 
8919ae82921SPaul Mullowney    Level: intermediate
8929ae82921SPaul Mullowney 
893e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
8949ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Standard creation sequence: build an empty matrix of the CUSPARSE type and
     preallocate the host-side AIJ storage; the GPU copy is constructed later,
     at assembly time (see MatAssemblyEnd_SeqAIJCUSPARSE). */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* a non-NULL nnz overrides nz; the cast drops const for the internal API */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9069ae82921SPaul Mullowney 
9079ae82921SPaul Mullowney #undef __FUNCT__
9089ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Destroy the GPU-side data of A (either the SpMV matrix or the triangular
   factors, depending on A->factortype), then fall through to the host-side
   SeqAIJ destroy. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: spptr holds the Mat_SeqAIJCUSPARSE SpMV structure */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a library-wide handle shared by all
     CUSPARSE matrices, yet it is destroyed here when the FIRST matrix is
     destroyed; presumably surviving matrices would then recreate it on next
     use — verify, since a stale handle would otherwise be dereferenced */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9529ae82921SPaul Mullowney 
9539ae82921SPaul Mullowney EXTERN_C_BEGIN
9549ae82921SPaul Mullowney #undef __FUNCT__
9559ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* MatCreate_SeqAIJCUSPARSE - type constructor for MATSEQAIJCUSPARSE, invoked when the
   matrix type is set to (seq)aijcusparse.  It first builds the ordinary host-side AIJ
   structure via MatCreate_SeqAIJ(), then hangs a GPU-side container off B->spptr:
   a Mat_SeqAIJCUSPARSE for a regular matrix, or a Mat_SeqAIJCUSPARSETriFactors when
   B is a factor matrix (B->factortype != MAT_FACTOR_NONE).  It also lazily creates
   the shared cuSPARSE handle (MAT_cusparseHandle, declared elsewhere in this file)
   and installs the CUSPARSE-specific entries in the function table.

   NOTE(review): B->spptr is allocated with C++ new here; MatDestroy_SeqAIJCUSPARSE
   deletes it and zeroes spptr before calling MatDestroy_SeqAIJ, which would otherwise
   try to PetscFree() it. */
9569ae82921SPaul Mullowney PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
9579ae82921SPaul Mullowney {
9589ae82921SPaul Mullowney   PetscErrorCode ierr;
9599ae82921SPaul Mullowney 
9609ae82921SPaul Mullowney   PetscFunctionBegin;
9619ae82921SPaul Mullowney   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
9629ae82921SPaul Mullowney   if (B->factortype==MAT_FACTOR_NONE) {
963e057df02SPaul Mullowney     /* you cannot check the inode.use flag here since the matrix was just created.
964e057df02SPaul Mullowney        now build a GPU matrix data structure */
9659ae82921SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSE;
9662205254eSKarl Rupp 
    /* zero/default-initialize the members; the GPU matrix and work vector are
       presumably created later (the constructor does not allocate device data) */
9679ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
9689ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
969e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
970bda325fcSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
9719ae82921SPaul Mullowney   } else {
9729ae82921SPaul Mullowney     /* NEXT, set the pointers to the triangular factors */
973debe9ee2SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
9742205254eSKarl Rupp 
    /* the lower/upper triangular factor slots start empty; the solve storage format
       defaults to the file-scope cusparseMatSolveStorageFormat (set elsewhere,
       e.g. via options) */
9759ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
9769ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
9779ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
978e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
979bda325fcSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
9809ae82921SPaul Mullowney   }
981e057df02SPaul Mullowney   /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
9829ae82921SPaul Mullowney   if (!MAT_cusparseHandle) {
9839ae82921SPaul Mullowney     cusparseStatus_t stat;
9849ae82921SPaul Mullowney     stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
9859ae82921SPaul Mullowney   }
986e057df02SPaul Mullowney   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
987e057df02SPaul Mullowney      default cusparse tri solve. Note the difference with the implementation in
988e057df02SPaul Mullowney      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
989*00de8ff0SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
9902205254eSKarl Rupp 
  /* install the CUSPARSE implementations of the core matrix operations */
9919ae82921SPaul Mullowney   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
9929ae82921SPaul Mullowney   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
9939ae82921SPaul Mullowney   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
9949ae82921SPaul Mullowney   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
995ca45077fSPaul Mullowney   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
996ca45077fSPaul Mullowney   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
997ca45077fSPaul Mullowney   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
998ca45077fSPaul Mullowney   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
9992205254eSKarl Rupp 
10009ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
10012205254eSKarl Rupp 
  /* nothing has been copied to the device yet */
10029ae82921SPaul Mullowney   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
10032205254eSKarl Rupp 
  /* expose the storage-format setter so MatCUSPARSESetFormat() can dispatch to this type */
1004*00de8ff0SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
10059ae82921SPaul Mullowney   PetscFunctionReturn(0);
10069ae82921SPaul Mullowney }
10079ae82921SPaul Mullowney EXTERN_C_END
10089ae82921SPaul Mullowney 
1009e057df02SPaul Mullowney /*M
1010e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1011e057df02SPaul Mullowney 
1012e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1013e057df02SPaul Mullowney    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1014e057df02SPaul Mullowney    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1015e057df02SPaul Mullowney    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1016e057df02SPaul Mullowney    Use --download-txpetscgpu to build/install PETSc in order to use the different
1017e057df02SPaul Mullowney    CUSPARSE library features and the different GPU storage formats.
1018e057df02SPaul Mullowney    Options Database Keys:
1019e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
10208468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
10218468deeeSKarl Rupp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
10228468deeeSKarl Rupp -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1023e057df02SPaul Mullowney 
1024e057df02SPaul Mullowney   Level: beginner
1025e057df02SPaul Mullowney 
10268468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1027e057df02SPaul Mullowney M*/
1028