/*
    Defines the basic matrix operations for the AIJ (compressed row)
    matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* This is a hack, but we know of no other way to pass this variable from one
   GPU_Matrix_Ifc class to another: the parallel SpMV must use the same stream
   in two different data structures, so a single instance of the stream is
   created here and reused. */
cudaStream_t theBodyStream=0;

PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END
/*
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver package providing triangular solvers (LU, ILU)
  for sequential matrices on the GPU, i.e. matrices of type seqaijcusparse, aijcusparse,
  seqaijcusp, or aijcusp.

   Level: beginner
*/
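
/* Example usage (a sketch, not part of this file; assumes a KSP/PC has been
   set up in the usual way):

     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverPackage(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);

   or, equivalently, on the command line: -pc_factor_mat_solver_package cusparse */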

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE matrix types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package; use
   --download-txpetscgpu when building/installing PETSc to obtain it.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
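
/* Example usage (a sketch; assumes A has already been created and given the
   MATSEQAIJCUSPARSE type):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or, equivalently, on the command line: -mat_cusparse_mult_storage_format ell */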

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);

  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
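      /* In the factored SeqAIJ storage, ai/aj index the strictly lower
         triangular entries (row 0 has none, so ai[1] marks its end); the n
         unit-diagonal entries are added explicitly here. */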
      nzLower=n+ai[n]-ai[1];

      /* Allocate space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                     n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
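      /* In the factored SeqAIJ storage the upper triangular part (diagonal
         included) is indexed by a->diag in decreasing order, hence the
         difference adiag[0]-adiag[n] counts its entries. */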
      nzUpper = adiag[0]-adiag[n];

      /* Allocate space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow               = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b    =(Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
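  /* Conceptually, the factors are of the row/column-permuted matrix, so the
     solve is x = Q*(U \ (L \ (P*b))): reorderIn applies the row permutation
     to b, the two triangular solves apply the L and U factors, and reorderOut
     applies the column permutation to the result. */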
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  PetscInt           m            = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse the nonzero structure with new matrix values, but
      for simplicity and ensured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec) delete cusparseMat->tempvec;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
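        /* ii gets one row pointer per nonzero row plus a closing entry, and
           ridx records the original row number of each compressed row so that
           results can be scattered back to the full-length vector (see
           setCPRowIndices() below). */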
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;

        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired). */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant for a general matrix */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray      = PETSC_NULL,*yarray = PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray      = PETSC_NULL,*yarray = PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray      = PETSC_NULL,*yarray = PETSC_NULL,*zarray = PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add: zz = yy + A*xx (yy was copied into zz above) */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray      = PETSC_NULL,*yarray = PETSC_NULL,*zarray = PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: zz = yy + A^T*xx (accumulate into zz, which holds the copy of yy) */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default sequential PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
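
/* Example usage (a sketch; error checking via CHKERRQ as usual):

     Mat      A;
     PetscInt n = 100;
     ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,3,PETSC_NULL,&A);CHKERRQ(ierr);
     ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/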

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*) cusparseMatLo;
      delete (GPU_Matrix_Ifc*) cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

    MAT_cusparseHandle=0;
  }
  /* zero spptr here because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree
     only works on memory allocated with PetscNew or PetscMalloc; spptr was allocated with C++ new
     (which calls the constructor) and has already been deleted above */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       Now build a GPU matrix data structure. */
    B->spptr = new Mat_SeqAIJCUSPARSE;

    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat     = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format  = MAT_CUSPARSE_CSR;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;

    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C, which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference from the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on NVIDIA GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package;
   use --download-txpetscgpu when building/installing PETSc to enable the CUSPARSE library
   and the different GPU storage formats.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with the 'txpetscgpu' package.
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with the 'txpetscgpu' package.
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with the 'txpetscgpu' package.

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
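
/* Example usage (a sketch): select this type at runtime in an existing AIJ code,
   assuming PETSc was configured with CUDA and the txpetscgpu package:

     ./myapp -mat_type aijcusparse -vec_type cusp -mat_cusparse_storage_format hyb

   or set it programmatically before assembly:

     ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
*/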