xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ce94432eddcd14845bc7e8083b7f8ea723b9bf7d)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
15e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
169ae82921SPaul Mullowney 
17e057df02SPaul Mullowney /* this is such a hack ... but I don't know of another way to pass this variable
18e057df02SPaul Mullowney    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19e057df02SPaul Mullowney    SpMV. Essentially, I need to use the same stream variable in two different
20e057df02SPaul Mullowney    data structures. I do this by creating a single instance of that stream
21e057df02SPaul Mullowney    and reuse it. */
22ca45077fSPaul Mullowney cudaStream_t theBodyStream=0;
239ae82921SPaul Mullowney 
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
319ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
32e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
338dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
348dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
358dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
368dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
379ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query callback composed onto factored matrices: reports which solver
   package produced the factorization.  Always answers MATSOLVERCUSPARSE.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
469ae82921SPaul Mullowney 
479ae82921SPaul Mullowney EXTERN_C_BEGIN
489ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
499ae82921SPaul Mullowney EXTERN_C_END
/*
  MATSOLVERCUSPARSE = "cusparse" - A solver package providing triangular solvers (ILU) for sequential
  matrices on the GPU, i.e. matrices of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp

   Level: beginner
*/
569ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine registered for the "cusparse" solver package.

   Creates the factor matrix via the plain PETSc seqaij factory, then (for
   LU/ILU/ILUDT) retypes it to SEQAIJCUSPARSE and installs the CUSPARSE
   symbolic-factorization entry points so the numeric phase and the solves
   run on the GPU.  Other factor types (e.g. Cholesky/ICC) are rejected.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* delegate creation of the factor matrix shell to the CPU seqaij factory */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* expose MatFactorGetSolverPackage() on the factor matrix */
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
789ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat() for
   SEQAIJCUSPARSE matrices: records the requested GPU storage format for the
   SpMV path (per-matrix), the triangular-solve path (module-wide variable
   cusparseMatSolveStorageFormat), or both.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* affects only the SpMV storage of this matrix */
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* affects only the triangular-solve storage (shared module-level setting) */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    /* affects both paths */
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
EXTERN_C_END
1049ae82921SPaul Mullowney 
105e057df02SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:
   None (the format is recorded on the matrix / module state).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A;
     silently a no-op for matrix types that do not provide it (PetscTryMethod) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
136e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes command-line options selecting the GPU storage format
   (CSR/ELL/HYB) for SEQAIJCUSPARSE matrices:
     -mat_cusparse_mult_storage_format   (non-factored matrices: SpMV)
     -mat_cusparse_solve_storage_format  (factored matrices: TriSolve)
     -mat_cusparse_storage_format        (both SpMV and TriSolve)
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead is invoked before PetscObjectOptionsBegin and
     there is no matching PetscOptionsTail before PetscOptionsEnd; the usual PETSc
     pattern is Begin -> Head -> ... -> Tail -> End.  Confirm this ordering is
     intentional for this PETSc version. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* non-factored matrix: the format option only affects SpMV */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: the format option only affects the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella option: applies the same format to both paths */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
1709ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: delegates entirely to the CPU SeqAIJ routine,
   then redirects the numeric phase to the CUSPARSE version so the factors are
   copied to (and analyzed on) the GPU afterwards.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  /* hook the GPU-aware numeric factorization */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1839ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: delegates entirely to the CPU SeqAIJ routine,
   then redirects the numeric phase to the CUSPARSE version (mirrors
   MatILUFactorSymbolic_SeqAIJCUSPARSE).
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  /* hook the GPU-aware numeric factorization */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1969ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Assembles the unit-lower-triangular factor L of a factored SeqAIJ matrix in
   CSR form on the host, uploads it into a GPU_Matrix_Ifc object, and runs the
   cuSPARSE triangular-solve analysis phase.

   Skipped unless the GPU copy is missing or stale
   (PETSC_CUSP_UNALLOCATED / PETSC_CUSP_CPU).
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;     /* CSR row pointers / column indices of L */
  PetscScalar                  *AALo;            /* CSR values of L */
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): assumes the PETSc factored-AIJ layout where the strictly-lower
         entries of rows 1..n-1 occupy ai[1]..ai[n]; the n unit-diagonal entries are
         added explicitly — confirm against MatLUFactorNumeric_SeqAIJ storage. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers so the device upload is fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L holds only the implicit unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;       /* next free slot in AjLo/AALo */
      rowOffset= 1;       /* running CSR row pointer */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      /* build the GPU matrix in the storage format configured for solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      /* cuSPARSE analysis phase; required once before the triangular solves */
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU copy exists */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
2679ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Assembles the upper-triangular factor U of a factored SeqAIJ matrix in CSR
   form on the host (storing 1/diag so the solve can multiply instead of
   divide — see AAUp[offset] below), uploads it into a GPU_Matrix_Ifc object,
   and runs the cuSPARSE triangular-solve analysis phase.

   Skipped unless the GPU copy is missing or stale
   (PETSC_CUSP_UNALLOCATED / PETSC_CUSP_CPU).
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               * cusparseMat       = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;     /* CSR row pointers / column indices of U */
  PetscScalar                  *AAUp;            /* CSR values of U (diagonal stored inverted) */
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* NOTE(review): relies on the factored-AIJ convention that a->diag stores the
         upper rows back-to-front, so adiag[0]-adiag[n] is the total count — confirm. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers so the device upload is fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix; rows are visited last-to-first while
         the write offset walks backwards from the end of the arrays */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];        /* store the reciprocal of the diagonal */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then copy the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* build the GPU matrix in the storage format configured for solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      /* cuSPARSE analysis phase; required once before the triangular solves */
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      /* host staging buffers are no longer needed once the GPU copy exists */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3349ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Copies both triangular factors of a factored SeqAIJ matrix to the GPU
   (building them if needed), allocates the GPU scratch vector used between
   the L- and U-solves, and records the row/column permutation indices when
   the factorization ordering is not the natural (identity) one.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the two triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* non-natural row ordering: the solve must permute on input */
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    /* non-natural column ordering: the solve must permute on output */
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3729ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: performs the factorization on the CPU via the
   SeqAIJ routine, selects the natural-ordering or permuted GPU solve kernels
   depending on whether both the row and column orderings are identities, then
   copies/analyzes the triangular factors on the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b    =(Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscBool      natural;

  PetscFunctionBegin;
  /* the factorization itself happens on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);

  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolveTranspose_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3999ae82921SPaul Mullowney 
4009ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Runs the cuSPARSE analysis phase for TRANSPOSED triangular solves on both
   stored factors.  The fill modes are deliberately swapped relative to the
   non-transposed case: the transpose of the lower factor is upper triangular
   (hence CUSPARSE_FILL_MODE_UPPER on cusparseMatLo) and vice versa.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  /* L^T is upper triangular */
  stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_UPPER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);

  /* U^T is lower triangular */
  stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle,
                                              CUSPARSE_MATRIX_TYPE_TRIANGULAR,
                                              CUSPARSE_FILL_MODE_LOWER,
                                              TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatUp->solveAnalysis(TRANSPOSE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
423bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Initializes and uploads the matrix data in transposed form on the GPU
   wrapper object (TRANSPOSE flag on both calls); presumably consumed by the
   MatMultTranspose path — confirm against the callers.
*/
PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  PetscFunctionBegin;
  stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, TRANSPOSE);CHKERRCUSP(stat);
  /* upload the host CSR arrays (a->i/a->j/a->a) with the transpose flag set */
  ierr = cusparseMat->mat->setMatrix(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a, TRANSPOSE);CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}
437bda325fcSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU for a factored matrix with a non-natural
   ordering: applies the input reordering, then the transposed U- and L-solves
   (in that order — the reverse of the forward solve), then the output
   reordering.  The transpose analysis data is built lazily on first use.
*/
PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;   /* only needed for the flop count below */
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Analyze the matrix ... on the fly */
  if (!cusparseTriFactors->hasTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    cusparseTriFactors->hasTranspose=PETSC_TRUE;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatUp->solve(xGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
  stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
  ierr = cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  /* block until the asynchronous GPU work has finished before logging */
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
475bda325fcSPaul Mullowney 
476bda325fcSPaul Mullowney #undef __FUNCT__
477bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
478bda325fcSPaul Mullowney PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
479bda325fcSPaul Mullowney {
480bda325fcSPaul Mullowney   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
481bda325fcSPaul Mullowney   PetscErrorCode    ierr;
482bda325fcSPaul Mullowney   CUSPARRAY         *xGPU, *bGPU;
483bda325fcSPaul Mullowney   cusparseStatus_t stat;
484bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
485bda325fcSPaul Mullowney   GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
486bda325fcSPaul Mullowney   GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
487bda325fcSPaul Mullowney   CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;
488bda325fcSPaul Mullowney 
489bda325fcSPaul Mullowney   PetscFunctionBegin;
490bda325fcSPaul Mullowney   /* Analyze the matrix ... on the fly */
491bda325fcSPaul Mullowney   if (!cusparseTriFactors->hasTranspose) {
492bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
493bda325fcSPaul Mullowney     cusparseTriFactors->hasTranspose=PETSC_TRUE;
494bda325fcSPaul Mullowney   }
495bda325fcSPaul Mullowney 
496bda325fcSPaul Mullowney   /* Get the GPU pointers */
497bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
498bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
499bda325fcSPaul Mullowney 
500bda325fcSPaul Mullowney   /* solve */
501bda325fcSPaul Mullowney   stat = cusparseMatUp->solve(bGPU, tempGPU, TRANSPOSE);CHKERRCUSP(stat);
502bda325fcSPaul Mullowney   stat = cusparseMatLo->solve(tempGPU, xGPU, TRANSPOSE);CHKERRCUSP(stat);
503bda325fcSPaul Mullowney 
504bda325fcSPaul Mullowney   /* restore */
505bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
506bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
507bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
508bda325fcSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
509bda325fcSPaul Mullowney   PetscFunctionReturn(0);
510bda325fcSPaul Mullowney }
511bda325fcSPaul Mullowney 
5129ae82921SPaul Mullowney 
5139ae82921SPaul Mullowney #undef __FUNCT__
5149ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
5159ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
5169ae82921SPaul Mullowney {
5179ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
5189ae82921SPaul Mullowney   PetscErrorCode               ierr;
519bda325fcSPaul Mullowney   CUSPARRAY                    *xGPU, *bGPU;
5209ae82921SPaul Mullowney   cusparseStatus_t             stat;
5219ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
5229ae82921SPaul Mullowney   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
5239ae82921SPaul Mullowney   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
5249ae82921SPaul Mullowney   CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;
5259ae82921SPaul Mullowney 
5269ae82921SPaul Mullowney   PetscFunctionBegin;
527e057df02SPaul Mullowney   /* Get the GPU pointers */
5289ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
5299ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
5309ae82921SPaul Mullowney 
531e057df02SPaul Mullowney   /* solve with reordering */
5329ae82921SPaul Mullowney   ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
5339ae82921SPaul Mullowney   stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
5349ae82921SPaul Mullowney   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
5359ae82921SPaul Mullowney   ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);
5369ae82921SPaul Mullowney 
5379ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
5389ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
5399ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
5409ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5419ae82921SPaul Mullowney   PetscFunctionReturn(0);
5429ae82921SPaul Mullowney }
5439ae82921SPaul Mullowney 
5449ae82921SPaul Mullowney 
5459ae82921SPaul Mullowney 
5469ae82921SPaul Mullowney 
5479ae82921SPaul Mullowney #undef __FUNCT__
5489ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
5499ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
5509ae82921SPaul Mullowney {
5519ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
5529ae82921SPaul Mullowney   PetscErrorCode               ierr;
553bda325fcSPaul Mullowney   CUSPARRAY                    *xGPU, *bGPU;
5549ae82921SPaul Mullowney   cusparseStatus_t             stat;
5559ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
5569ae82921SPaul Mullowney   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
5579ae82921SPaul Mullowney   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
5589ae82921SPaul Mullowney   CUSPARRAY                    * tempGPU           = (CUSPARRAY*) cusparseTriFactors->tempvec;
5599ae82921SPaul Mullowney 
5609ae82921SPaul Mullowney   PetscFunctionBegin;
561e057df02SPaul Mullowney   /* Get the GPU pointers */
5629ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
5639ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
5649ae82921SPaul Mullowney 
565e057df02SPaul Mullowney   /* solve */
5669ae82921SPaul Mullowney   stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
5679ae82921SPaul Mullowney   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
5689ae82921SPaul Mullowney 
5699ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
5709ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
5719ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
5729ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5739ae82921SPaul Mullowney   PetscFunctionReturn(0);
5749ae82921SPaul Mullowney }
5759ae82921SPaul Mullowney 
5769ae82921SPaul Mullowney #undef __FUNCT__
577e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
578e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
5799ae82921SPaul Mullowney {
5809ae82921SPaul Mullowney 
5819ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
5829ae82921SPaul Mullowney   Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
5839ae82921SPaul Mullowney   PetscInt           m            = A->rmap->n,*ii,*ridx;
5849ae82921SPaul Mullowney   PetscErrorCode     ierr;
5859ae82921SPaul Mullowney 
5869ae82921SPaul Mullowney 
5879ae82921SPaul Mullowney   PetscFunctionBegin;
5889ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
5899ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
5909ae82921SPaul Mullowney     /*
5919ae82921SPaul Mullowney       It may be possible to reuse nonzero structure with new matrix values but
5929ae82921SPaul Mullowney       for simplicity and insured correctness we delete and build a new matrix on
5939ae82921SPaul Mullowney       the GPU. Likely a very small performance hit.
5949ae82921SPaul Mullowney     */
5959ae82921SPaul Mullowney     if (cusparseMat->mat) {
5969ae82921SPaul Mullowney       try {
5979ae82921SPaul Mullowney         delete cusparseMat->mat;
5982205254eSKarl Rupp         if (cusparseMat->tempvec) delete cusparseMat->tempvec;
5999ae82921SPaul Mullowney 
6009ae82921SPaul Mullowney       } catch(char *ex) {
6019ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6029ae82921SPaul Mullowney       }
6039ae82921SPaul Mullowney     }
6049ae82921SPaul Mullowney     try {
6059ae82921SPaul Mullowney       cusparseMat->nonzerorow=0;
6062205254eSKarl Rupp       for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);
6079ae82921SPaul Mullowney 
6089ae82921SPaul Mullowney       if (a->compressedrow.use) {
6099ae82921SPaul Mullowney         m    = a->compressedrow.nrows;
6109ae82921SPaul Mullowney         ii   = a->compressedrow.i;
6119ae82921SPaul Mullowney         ridx = a->compressedrow.rindex;
6129ae82921SPaul Mullowney       } else {
613e057df02SPaul Mullowney         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
6149ae82921SPaul Mullowney         int k=0;
6159ae82921SPaul Mullowney         ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
6169ae82921SPaul Mullowney         ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
6179ae82921SPaul Mullowney         ii[0]=0;
6189ae82921SPaul Mullowney         for (int j = 0; j<m; j++) {
6199ae82921SPaul Mullowney           if ((a->i[j+1]-a->i[j])>0) {
6209ae82921SPaul Mullowney             ii[k]  = a->i[j];
6219ae82921SPaul Mullowney             ridx[k]= j;
6229ae82921SPaul Mullowney             k++;
6239ae82921SPaul Mullowney           }
6249ae82921SPaul Mullowney         }
6259ae82921SPaul Mullowney         ii[cusparseMat->nonzerorow] = a->nz;
6262205254eSKarl Rupp 
6279ae82921SPaul Mullowney         m = cusparseMat->nonzerorow;
6289ae82921SPaul Mullowney       }
6299ae82921SPaul Mullowney 
630e057df02SPaul Mullowney       /* Build our matrix ... first determine the GPU storage type */
631e057df02SPaul Mullowney       cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);
6329ae82921SPaul Mullowney 
633e057df02SPaul Mullowney       /* Create the streams and events (if desired).  */
6349ae82921SPaul Mullowney       PetscMPIInt size;
6359ae82921SPaul Mullowney       ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
6369ae82921SPaul Mullowney       ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);
6379ae82921SPaul Mullowney 
638e057df02SPaul Mullowney       /* FILL MODE UPPER is irrelevant */
639bda325fcSPaul Mullowney       cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
640ca45077fSPaul Mullowney 
641e057df02SPaul Mullowney       /* lastly, build the matrix */
6429ae82921SPaul Mullowney       ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
6439ae82921SPaul Mullowney       cusparseMat->mat->setCPRowIndices(ridx, m);
6449ae82921SPaul Mullowney       if (!a->compressedrow.use) {
6459ae82921SPaul Mullowney         ierr = PetscFree(ii);CHKERRQ(ierr);
6469ae82921SPaul Mullowney         ierr = PetscFree(ridx);CHKERRQ(ierr);
6479ae82921SPaul Mullowney       }
6489ae82921SPaul Mullowney       cusparseMat->tempvec = new CUSPARRAY;
6499ae82921SPaul Mullowney       cusparseMat->tempvec->resize(m);
6509ae82921SPaul Mullowney     } catch(char *ex) {
6519ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6529ae82921SPaul Mullowney     }
6539ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
6542205254eSKarl Rupp 
6559ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
6562205254eSKarl Rupp 
6579ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
6589ae82921SPaul Mullowney   }
6599ae82921SPaul Mullowney   PetscFunctionReturn(0);
6609ae82921SPaul Mullowney }
6619ae82921SPaul Mullowney 
6629ae82921SPaul Mullowney #undef __FUNCT__
6639ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
6649ae82921SPaul Mullowney PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
6659ae82921SPaul Mullowney {
6669ae82921SPaul Mullowney   PetscErrorCode ierr;
6679ae82921SPaul Mullowney 
6689ae82921SPaul Mullowney   PetscFunctionBegin;
6699ae82921SPaul Mullowney   if (right) {
670*ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
6719ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
6729ae82921SPaul Mullowney     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
6739ae82921SPaul Mullowney     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
6749ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
6759ae82921SPaul Mullowney   }
6769ae82921SPaul Mullowney   if (left) {
677*ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
6789ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
6799ae82921SPaul Mullowney     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
6809ae82921SPaul Mullowney     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
6819ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
6829ae82921SPaul Mullowney   }
6839ae82921SPaul Mullowney   PetscFunctionReturn(0);
6849ae82921SPaul Mullowney }
6859ae82921SPaul Mullowney 
6869ae82921SPaul Mullowney #undef __FUNCT__
6879ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
6889ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
6899ae82921SPaul Mullowney {
6909ae82921SPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
6919ae82921SPaul Mullowney   PetscErrorCode     ierr;
6929ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
693bda325fcSPaul Mullowney   CUSPARRAY          *xarray,*yarray;
6949ae82921SPaul Mullowney 
6959ae82921SPaul Mullowney   PetscFunctionBegin;
696e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
697e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
6989ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
6999ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
7009ae82921SPaul Mullowney   try {
7019ae82921SPaul Mullowney     ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
7029ae82921SPaul Mullowney   } catch (char *ex) {
7039ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
7049ae82921SPaul Mullowney   }
7059ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
7069ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
707ca45077fSPaul Mullowney   if (!cusparseMat->mat->hasNonZeroStream()) {
7089ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
709ca45077fSPaul Mullowney   }
7109ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
7119ae82921SPaul Mullowney   PetscFunctionReturn(0);
7129ae82921SPaul Mullowney }
7139ae82921SPaul Mullowney 
7149ae82921SPaul Mullowney 
7159ae82921SPaul Mullowney #undef __FUNCT__
716ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
717ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
718ca45077fSPaul Mullowney {
719ca45077fSPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
720ca45077fSPaul Mullowney   PetscErrorCode     ierr;
721ca45077fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
722bda325fcSPaul Mullowney   CUSPARRAY          *xarray,*yarray;
723ca45077fSPaul Mullowney 
724ca45077fSPaul Mullowney   PetscFunctionBegin;
725e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
726e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
727bda325fcSPaul Mullowney   if (!cusparseMat->hasTranspose) {
728bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
729bda325fcSPaul Mullowney     cusparseMat->hasTranspose=PETSC_TRUE;
730bda325fcSPaul Mullowney   }
731ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
732ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
733ca45077fSPaul Mullowney   try {
734ca45077fSPaul Mullowney     ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
735ca45077fSPaul Mullowney   } catch (char *ex) {
736ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
737ca45077fSPaul Mullowney   }
738ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
739ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
740ca45077fSPaul Mullowney   if (!cusparseMat->mat->hasNonZeroStream()) {
741ca45077fSPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
742ca45077fSPaul Mullowney   }
743ca45077fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
744ca45077fSPaul Mullowney   PetscFunctionReturn(0);
745ca45077fSPaul Mullowney }
746ca45077fSPaul Mullowney 
747ca45077fSPaul Mullowney #undef __FUNCT__
7489ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
7499ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
7509ae82921SPaul Mullowney {
7519ae82921SPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
7529ae82921SPaul Mullowney   PetscErrorCode     ierr;
7539ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
754bda325fcSPaul Mullowney   CUSPARRAY          *xarray,*yarray,*zarray;
7556e111a19SKarl Rupp 
7569ae82921SPaul Mullowney   PetscFunctionBegin;
757e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
758e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
7599ae82921SPaul Mullowney   try {
7609ae82921SPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
7619ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
7629ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
7639ae82921SPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
7649ae82921SPaul Mullowney 
765e057df02SPaul Mullowney     /* multiply add */
7669ae82921SPaul Mullowney     ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);
7679ae82921SPaul Mullowney 
7689ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
7699ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
7709ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
7719ae82921SPaul Mullowney 
7729ae82921SPaul Mullowney   } catch(char *ex) {
7739ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
7749ae82921SPaul Mullowney   }
7759ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
7769ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
7779ae82921SPaul Mullowney   PetscFunctionReturn(0);
7789ae82921SPaul Mullowney }
7799ae82921SPaul Mullowney 
7809ae82921SPaul Mullowney #undef __FUNCT__
781ca45077fSPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
782ca45077fSPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
783ca45077fSPaul Mullowney {
784ca45077fSPaul Mullowney   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
785ca45077fSPaul Mullowney   PetscErrorCode     ierr;
786ca45077fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
787ca45077fSPaul Mullowney   CUSPARRAY          *xarray,*yarray,*zarray;
7886e111a19SKarl Rupp 
789ca45077fSPaul Mullowney   PetscFunctionBegin;
790e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
791e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
792bda325fcSPaul Mullowney   if (!cusparseMat->hasTranspose) {
793bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
794bda325fcSPaul Mullowney     cusparseMat->hasTranspose=PETSC_TRUE;
795bda325fcSPaul Mullowney   }
796ca45077fSPaul Mullowney   try {
797ca45077fSPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
798ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
799ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
800ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
801ca45077fSPaul Mullowney 
802e057df02SPaul Mullowney     /* multiply add with matrix transpose */
803ca45077fSPaul Mullowney     ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
804ca45077fSPaul Mullowney 
805ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
806ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
807ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
808ca45077fSPaul Mullowney 
809ca45077fSPaul Mullowney   } catch(char *ex) {
810ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
811ca45077fSPaul Mullowney   }
812ca45077fSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
813ca45077fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
814ca45077fSPaul Mullowney   PetscFunctionReturn(0);
815ca45077fSPaul Mullowney }
816ca45077fSPaul Mullowney 
817ca45077fSPaul Mullowney #undef __FUNCT__
8189ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
8199ae82921SPaul Mullowney PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
8209ae82921SPaul Mullowney {
8219ae82921SPaul Mullowney   PetscErrorCode ierr;
8226e111a19SKarl Rupp 
8239ae82921SPaul Mullowney   PetscFunctionBegin;
8249ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
825e057df02SPaul Mullowney   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
8269ae82921SPaul Mullowney   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
827bbf3fe20SPaul Mullowney   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
828bbf3fe20SPaul Mullowney   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
829bbf3fe20SPaul Mullowney   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
830bbf3fe20SPaul Mullowney   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
8319ae82921SPaul Mullowney   PetscFunctionReturn(0);
8329ae82921SPaul Mullowney }
8339ae82921SPaul Mullowney 
8349ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
8359ae82921SPaul Mullowney #undef __FUNCT__
8369ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
837e057df02SPaul Mullowney /*@
8389ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
839e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
840e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
841e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
842e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
843e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
8449ae82921SPaul Mullowney 
8459ae82921SPaul Mullowney    Collective on MPI_Comm
8469ae82921SPaul Mullowney 
8479ae82921SPaul Mullowney    Input Parameters:
8489ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
8499ae82921SPaul Mullowney .  m - number of rows
8509ae82921SPaul Mullowney .  n - number of columns
8519ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
8529ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
8530298fd71SBarry Smith          (possibly different for each row) or NULL
8549ae82921SPaul Mullowney 
8559ae82921SPaul Mullowney    Output Parameter:
8569ae82921SPaul Mullowney .  A - the matrix
8579ae82921SPaul Mullowney 
8589ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
8599ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
8609ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
8619ae82921SPaul Mullowney 
8629ae82921SPaul Mullowney    Notes:
8639ae82921SPaul Mullowney    If nnz is given then nz is ignored
8649ae82921SPaul Mullowney 
8659ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
8669ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
8679ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
8689ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
8699ae82921SPaul Mullowney 
8709ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
8710298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
8729ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
8739ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
8749ae82921SPaul Mullowney 
8759ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
8769ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
8779ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
8789ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
8799ae82921SPaul Mullowney 
8809ae82921SPaul Mullowney    Level: intermediate
8819ae82921SPaul Mullowney 
882e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
8839ae82921SPaul Mullowney @*/
8849ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
8859ae82921SPaul Mullowney {
8869ae82921SPaul Mullowney   PetscErrorCode ierr;
8879ae82921SPaul Mullowney 
8889ae82921SPaul Mullowney   PetscFunctionBegin;
8899ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
8909ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
8919ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
8929ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
8939ae82921SPaul Mullowney   PetscFunctionReturn(0);
8949ae82921SPaul Mullowney }
8959ae82921SPaul Mullowney 
8969ae82921SPaul Mullowney #undef __FUNCT__
8979ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
8989ae82921SPaul Mullowney PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
8999ae82921SPaul Mullowney {
9009ae82921SPaul Mullowney   PetscErrorCode     ierr;
9019ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
9029ae82921SPaul Mullowney 
9039ae82921SPaul Mullowney   PetscFunctionBegin;
9049ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
9059ae82921SPaul Mullowney     try {
9069ae82921SPaul Mullowney       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
9079ae82921SPaul Mullowney         delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
9089ae82921SPaul Mullowney       }
9092205254eSKarl Rupp       if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
9109ae82921SPaul Mullowney       delete cusparseMat;
9119ae82921SPaul Mullowney       A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
9129ae82921SPaul Mullowney     } catch(char *ex) {
9139ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
9149ae82921SPaul Mullowney     }
9159ae82921SPaul Mullowney   } else {
916e057df02SPaul Mullowney     /* The triangular factors */
9179ae82921SPaul Mullowney     try {
9189ae82921SPaul Mullowney       Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
9199ae82921SPaul Mullowney       GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
9209ae82921SPaul Mullowney       GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
9219ae82921SPaul Mullowney       delete (GPU_Matrix_Ifc*) cusparseMatLo;
9229ae82921SPaul Mullowney       delete (GPU_Matrix_Ifc*) cusparseMatUp;
9239ae82921SPaul Mullowney       delete (CUSPARRAY*) cusparseTriFactors->tempvec;
9249ae82921SPaul Mullowney       delete cusparseTriFactors;
9259ae82921SPaul Mullowney     } catch(char *ex) {
9269ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
9279ae82921SPaul Mullowney     }
9289ae82921SPaul Mullowney   }
9299ae82921SPaul Mullowney   if (MAT_cusparseHandle) {
9309ae82921SPaul Mullowney     cusparseStatus_t stat;
9319ae82921SPaul Mullowney     stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
9322205254eSKarl Rupp 
9339ae82921SPaul Mullowney     MAT_cusparseHandle=0;
9349ae82921SPaul Mullowney   }
9359ae82921SPaul Mullowney   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
9369ae82921SPaul Mullowney   A->spptr = 0;
9379ae82921SPaul Mullowney 
9389ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
9399ae82921SPaul Mullowney   PetscFunctionReturn(0);
9409ae82921SPaul Mullowney }
9419ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Turns a freshly created SeqAIJ matrix into a SeqAIJCUSPARSE matrix: attaches
   the GPU-side data structure to B->spptr, ensures the shared cusparse handle
   exists, installs the CUSPARSE implementations of the matrix operations, and
   registers the cusparse factorization routine.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build the host-side (CPU) AIJ representation first */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;

    cusparsestruct->mat          = 0;
    cusparsestruct->tempvec      = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->hasTranspose = PETSC_FALSE;

    B->spptr = cusparsestruct;
  } else {
    /* a factored matrix instead carries pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *trifactors = new Mat_SeqAIJCUSPARSETriFactors;

    trifactors->loTriFactorPtr = 0;
    trifactors->upTriFactorPtr = 0;
    trifactors->tempvec        = 0;
    trifactors->format         = cusparseMatSolveStorageFormat;
    trifactors->hasTranspose   = PETSC_FALSE;

    B->spptr = trifactors;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* route the core matrix operations through the CUSPARSE implementations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
9979ae82921SPaul Mullowney 
998e057df02SPaul Mullowney /*M
999e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1000e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc with a different CUSPARSE library and
   support for the different GPU storage formats.
1006e057df02SPaul Mullowney 
1007e057df02SPaul Mullowney    Options Database Keys:
1008e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
10098468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
10108468deeeSKarl Rupp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1012e057df02SPaul Mullowney 
1013e057df02SPaul Mullowney   Level: beginner
1014e057df02SPaul Mullowney 
10158468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1016e057df02SPaul Mullowney M*/
1017