xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 8468deee71cff1b2e1654d636f4c743a8bd8dfe8)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
15e057df02SPaul Mullowney const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
169ae82921SPaul Mullowney 
17e057df02SPaul Mullowney /* this is such a hack ... but I don't know of another way to pass this variable
18e057df02SPaul Mullowney    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19e057df02SPaul Mullowney    SpMV. Essentially, I need to use the same stream variable in two different
20e057df02SPaul Mullowney    data structures. I do this by creating a single instance of that stream
21e057df02SPaul Mullowney    and reuse it. */
22ca45077fSPaul Mullowney cudaStream_t theBodyStream=0;
239ae82921SPaul Mullowney 
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
299ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
318dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
328dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
338dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
348dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
359ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
  Query routine composed onto factor matrices created by MatGetFactor_seqaij_cusparse:
  reports MATSOLVERCUSPARSE ("cusparse") as the solver package that produced the factor.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
449ae82921SPaul Mullowney 
459ae82921SPaul Mullowney EXTERN_C_BEGIN
469ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
479ae82921SPaul Mullowney EXTERN_C_END
48e057df02SPaul Mullowney /*
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver package providing triangular solvers (ILU) on the GPU
  for sequential matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp
519ae82921SPaul Mullowney 
529ae82921SPaul Mullowney    Level: beginner
53e057df02SPaul Mullowney */
549ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
  Factory routine registered for MATSOLVERCUSPARSE: creates the factor matrix B for A.
  Delegates creation to the standard PETSc seqaij factory, then retypes the result to
  MATSEQAIJCUSPARSE and installs the CUSPARSE symbolic-factorization hooks.

  Only LU-family factorizations (LU, ILU, ILUDT) are supported; any other factor
  type raises PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* build a conventional seqaij factor first, then specialize it */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* expose MatFactorGetSolverPackage() so callers can query the package name */
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
759ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
  Type-specific implementation of MatCUSPARSESetFormat() for SeqAIJCUSPARSE:
  records the requested GPU storage format for the SpMV matrix, for the
  triangular-solve factors, or for both.  Unknown operations raise PETSC_ERR_SUP.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseStruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* format used by the SpMV matrix only */
    cusparseStruct->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* format used by the triangular-solve factors only */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    /* both of the above */
    cusparseStruct->format        = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
EXTERN_C_END
1019ae82921SPaul Mullowney 
102e057df02SPaul Mullowney 
103e057df02SPaul Mullowney /*@
104e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
105e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
106*8468deeeSKarl Rupp    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
107e057df02SPaul Mullowney    to build/install PETSc to use this package.
108e057df02SPaul Mullowney 
109e057df02SPaul Mullowney    Not Collective
110e057df02SPaul Mullowney 
111e057df02SPaul Mullowney    Input Parameters:
112*8468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
113*8468deeeSKarl Rupp .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
114*8468deeeSKarl Rupp -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
115e057df02SPaul Mullowney 
116e057df02SPaul Mullowney    Output Parameter:
117e057df02SPaul Mullowney 
118e057df02SPaul Mullowney    Level: intermediate
119e057df02SPaul Mullowney 
120*8468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
121e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
/* Public entry point documented in the manual page above: validates A and forwards
   to the type-specific "MatCUSPARSESetFormat_C" method, if one is composed on A. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for Mat types that did not register the method */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
133e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
  Processes the -mat_cusparse_* options-database entries for a SeqAIJCUSPARSE
  matrix: the SpMV storage format for unfactored matrices, the triangular-solve
  storage format for factored matrices, and -mat_cusparse_storage_format which
  sets both at once (queried last, so it overrides either of the specific ones).
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* PetscOptionsHead() relies on state established by PetscObjectOptionsBegin(),
     so Begin must come first (the original called Head before Begin). */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: only the SpMV format is relevant */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: only the triangular-solve format is relevant */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* applies to SpMV and TriSolve alike; checked last so it wins if also given */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1689ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
  Symbolic ILU factorization: reuses the CPU SeqAIJ symbolic phase unchanged,
  then redirects the numeric phase to the CUSPARSE implementation so the
  resulting triangular factors end up on the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1809ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
  Symbolic LU factorization: reuses the CPU SeqAIJ symbolic phase unchanged,
  then redirects the numeric phase to the CUSPARSE implementation so the
  resulting triangular factors end up on the GPU.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1929ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
  Builds the unit-lower-triangular factor L on the GPU from the host-side
  factored SeqAIJ storage of A and runs the cusparse solve analysis on it.

  The factored SeqAIJ format stores the strictly-lower entries of each row in
  a->i/a->j/a->a with the unit diagonal implicit, so an explicit CSR copy with
  1.0 diagonal entries is assembled in pinned host memory, handed to the GPU
  matrix object, and then freed.  No-op if the GPU copy is already current.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value is overwritten below before use; a previously
     built factor is not freed here -- verify there is no leak on re-factorization */
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1
         (row 0 of L holds only its diagonal; assumes n >= 1 -- TODO confirm n == 0 never reaches here) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers to speed up the upcoming host-to-device copy */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 is just the unit diagonal */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the configured storage format, upload, and analyze */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* host staging buffers are no longer needed once the device copy exists */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
2609ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
  Builds the upper-triangular factor U on the GPU from the host-side factored
  SeqAIJ storage of A and runs the cusparse solve analysis on it.

  The U part of the factored SeqAIJ format is addressed through a->diag, with
  rows laid out in reverse order (adiag[i] > adiag[i+1]), so the CSR copy is
  assembled back-to-front in pinned host memory, handed to the GPU matrix
  object, and then freed.  No-op if the GPU copy is already current.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value is overwritten below before use; a previously
     built factor is not freed here -- verify there is no leak on re-factorization */
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with i for the reversed U storage, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers to speed up the upcoming host-to-device copy */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk rows from last to first, filling the CSR arrays from the back */
      for (i=n-1; i>=0; i--) {
        /* off-diagonal entries of row i start just past the (reversed) row i+1 block */
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* PETSc's factored storage appears to keep the inverted diagonal, so 1./v[nz]
           recovers the true diagonal value for cusparse -- NOTE(review): confirm */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the configured storage format, upload, and analyze */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      /* host staging buffers are no longer needed once the device copy exists */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3249ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
  Copies both triangular factors of A to the GPU (building them and running the
  cusparse solve analysis), allocates the work vector shared by the two
  triangular solves, and records the row/column permutations MatSolve needs
  for non-natural orderings.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* intermediate vector for the L then U solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: only needed when the row ordering is not the identity.
     Braces are required here -- without them CHKERRCUSP(ierr) was a second, separate
     statement executed unconditionally against a stale ierr. */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: same pattern for the column ordering */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3599ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
  Numeric LU factorization: performs the factorization on the CPU with the
  standard SeqAIJ kernel, then copies the triangular factors to the GPU and
  selects the MatSolve variant (the natural-ordering path skips the
  permutation steps).
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3819ae82921SPaul Mullowney 
3829ae82921SPaul Mullowney 
3839ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
  Solves A x = b on the GPU with the previously built triangular factors:
  permute b by the row ordering, forward-solve with L, back-solve with U,
  then permute the result by the column ordering.  This is the general
  (non-natural-ordering) path.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor           = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor           = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *workGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
  CUSPARRAY                    *xGPU               = PETSC_NULL,*bGPU = PETSC_NULL;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* device-side views of the PETSc vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* reorder b into x, solve L then U, and undo the ordering on the result */
  ierr = loFactor->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = loFactor->solve(xGPU, workGPU);CHKERRCUSP(stat);
  stat = upFactor->solve(workGPU, xGPU);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4149ae82921SPaul Mullowney 
4159ae82921SPaul Mullowney 
4169ae82921SPaul Mullowney 
4179ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
  Solves A x = b on the GPU for factors produced with identity row and column
  orderings: identical to MatSolve_SeqAIJCUSPARSE but with the reorderIn/
  reorderOut permutation steps omitted.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: forward solve with L directly from b, then back solve with U into x */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4469ae82921SPaul Mullowney 
4479ae82921SPaul Mullowney #undef __FUNCT__
448e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
449e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
4509ae82921SPaul Mullowney {
4519ae82921SPaul Mullowney 
4529ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
4539ae82921SPaul Mullowney   Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
4549ae82921SPaul Mullowney   PetscInt        m           = A->rmap->n,*ii,*ridx;
4559ae82921SPaul Mullowney   PetscErrorCode  ierr;
4569ae82921SPaul Mullowney 
4579ae82921SPaul Mullowney 
4589ae82921SPaul Mullowney   PetscFunctionBegin;
4599ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
4609ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
4619ae82921SPaul Mullowney     /*
4629ae82921SPaul Mullowney       It may be possible to reuse nonzero structure with new matrix values but
4639ae82921SPaul Mullowney       for simplicity and insured correctness we delete and build a new matrix on
4649ae82921SPaul Mullowney       the GPU. Likely a very small performance hit.
4659ae82921SPaul Mullowney     */
4669ae82921SPaul Mullowney     if (cusparseMat->mat) {
4679ae82921SPaul Mullowney       try {
4689ae82921SPaul Mullowney         delete cusparseMat->mat;
4699ae82921SPaul Mullowney         if (cusparseMat->tempvec)
4709ae82921SPaul Mullowney           delete cusparseMat->tempvec;
4719ae82921SPaul Mullowney 
4729ae82921SPaul Mullowney       } catch(char* ex) {
4739ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4749ae82921SPaul Mullowney       }
4759ae82921SPaul Mullowney     }
4769ae82921SPaul Mullowney     try {
4779ae82921SPaul Mullowney       cusparseMat->nonzerorow=0;
4789ae82921SPaul Mullowney       for (int j = 0; j<m; j++)
4799ae82921SPaul Mullowney         cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);
4809ae82921SPaul Mullowney 
4819ae82921SPaul Mullowney       if (a->compressedrow.use) {
4829ae82921SPaul Mullowney         m    = a->compressedrow.nrows;
4839ae82921SPaul Mullowney         ii   = a->compressedrow.i;
4849ae82921SPaul Mullowney         ridx = a->compressedrow.rindex;
4859ae82921SPaul Mullowney       } else {
486e057df02SPaul Mullowney         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
4879ae82921SPaul Mullowney         int k=0;
4889ae82921SPaul Mullowney         ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
4899ae82921SPaul Mullowney         ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
4909ae82921SPaul Mullowney         ii[0]=0;
4919ae82921SPaul Mullowney         for (int j = 0; j<m; j++) {
4929ae82921SPaul Mullowney           if ((a->i[j+1]-a->i[j])>0) {
4939ae82921SPaul Mullowney             ii[k] = a->i[j];
4949ae82921SPaul Mullowney             ridx[k]= j;
4959ae82921SPaul Mullowney             k++;
4969ae82921SPaul Mullowney           }
4979ae82921SPaul Mullowney         }
4989ae82921SPaul Mullowney         ii[cusparseMat->nonzerorow] = a->nz;
4999ae82921SPaul Mullowney         m = cusparseMat->nonzerorow;
5009ae82921SPaul Mullowney       }
5019ae82921SPaul Mullowney 
502e057df02SPaul Mullowney       /* Build our matrix ... first determine the GPU storage type */
503e057df02SPaul Mullowney       cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);
5049ae82921SPaul Mullowney 
505e057df02SPaul Mullowney       /* Create the streams and events (if desired).  */
5069ae82921SPaul Mullowney       PetscMPIInt    size;
5079ae82921SPaul Mullowney       ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
5089ae82921SPaul Mullowney       ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);
5099ae82921SPaul Mullowney 
510e057df02SPaul Mullowney       /* FILL MODE UPPER is irrelevant */
511ca45077fSPaul Mullowney       cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
512ca45077fSPaul Mullowney 
513e057df02SPaul Mullowney       /* lastly, build the matrix */
5149ae82921SPaul Mullowney       ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
5159ae82921SPaul Mullowney       cusparseMat->mat->setCPRowIndices(ridx, m);
5169ae82921SPaul Mullowney       if (!a->compressedrow.use) {
5179ae82921SPaul Mullowney         ierr = PetscFree(ii);CHKERRQ(ierr);
5189ae82921SPaul Mullowney         ierr = PetscFree(ridx);CHKERRQ(ierr);
5199ae82921SPaul Mullowney       }
5209ae82921SPaul Mullowney       cusparseMat->tempvec = new CUSPARRAY;
5219ae82921SPaul Mullowney       cusparseMat->tempvec->resize(m);
5229ae82921SPaul Mullowney     } catch(char* ex) {
5239ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5249ae82921SPaul Mullowney     }
5259ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
5269ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
5279ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
5289ae82921SPaul Mullowney   }
5299ae82921SPaul Mullowney   PetscFunctionReturn(0);
5309ae82921SPaul Mullowney }
5319ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   MatGetVecs_SeqAIJCUSPARSE - Creates GPU (VECSEQCUSP) vectors conforming to
   the matrix layouts: the right vector matches the column layout (a valid
   MatMult input) and the left vector matches the row layout (a valid MatMult
   output).  Either output pointer may be PETSC_NULL if that vector is not
   wanted.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the column space, so it must take the
       column-map block size; the previous code used mat->rmap->bs, which
       is only correct when the matrix is square with equal block sizes */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
5559ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - Computes yy = A*xx entirely on the GPU using the
   CUSPARSE matrix held in A->spptr.  The matrix is assumed to already be
   current on the GPU: MatSeqAIJCUSPARSECopyToGPU() is invoked from
   MatAssemblyEnd_SeqAIJCUSPARSE(), so no host-to-device copy happens here.
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU   = PETSC_NULL;
  CUSPARRAY          *yGPU   = PETSC_NULL;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpuMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only block for completion when the multiply did not run on its own stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one add per nonempty row */
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5839ae82921SPaul Mullowney 
5849ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T*xx (A^H*xx when PETSc
   is configured with complex scalars) on the GPU.  As with MatMult, the GPU
   copy of the matrix is kept current by MatAssemblyEnd_SeqAIJCUSPARSE(), so
   no host-to-device transfer is performed here.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xGPU   = PETSC_NULL;
  CUSPARRAY          *yGPU   = PETSC_NULL;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if defined(PETSC_USE_COMPLEX)
    /* complex scalars: the "transpose" product is the conjugate transpose */
    ierr = gpuMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#else
    ierr = gpuMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only block for completion when the multiply did not run on its own stream */
  if (!gpuMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one add per nonempty row */
  ierr = PetscLogFlops(2.0*aij->nz - gpuMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
616ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - Computes zz = A*xx + yy on the GPU.  yy is
   first copied into zz (VecCopy_SeqCUSP, which also ensures the data is
   valid on the GPU), then the CUSPARSE multiply-add accumulates A*xx into
   zz in place.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray=PETSC_NULL,*zarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The matrix is pushed to the GPU in MatAssemblyEnd_SeqAIJCUSPARSE, so no
     MatSeqAIJCUSPARSECopyToGPU(A) call is needed here. */
  try {
    /* zz <- yy */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* zz <- A*xx + zz.  The previous version also acquired a read array on
       yy that was never used; that dead get/restore pair has been removed. */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
6499ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = A^T*xx + yy (A^H*xx + yy
   with complex scalars) on the GPU.  yy is first copied into zz, then the
   CUSPARSE multiply-add accumulates the transpose product into zz in place.

   Fixes relative to the previous version:
   - __FUNCT__ was wrongly set to "MatMultAdd_SeqAIJCUSPARSE", producing
     misleading error tracebacks; it now matches the function name.
   - multiplyAdd() was accumulating into yarray (i.e. into yy), which left
     zz holding only the copy of yy and clobbered yy; it now accumulates
     into zarray, mirroring MatMultAdd_SeqAIJCUSPARSE.  The then-unused
     read access of yy has been removed.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray=PETSC_NULL,*zarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The matrix is pushed to the GPU in MatAssemblyEnd_SeqAIJCUSPARSE, so no
     MatSeqAIJCUSPARSECopyToGPU(A) call is needed here. */
  try {
    /* zz <- yy */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* zz <- A^T*xx + zz (conjugate transpose for complex builds) */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
686ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Completes host-side AIJ assembly, mirrors
   the matrix to the GPU, and (on final assembly only) installs the GPU
   multiply kernels into the function table.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only switch to the GPU kernels once assembly is truly finished */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
7039ae82921SPaul Mullowney 
7049ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
7059ae82921SPaul Mullowney #undef __FUNCT__
7069ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
707e057df02SPaul Mullowney /*@
7089ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
709e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
710e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
711e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
712e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
713e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7149ae82921SPaul Mullowney 
7159ae82921SPaul Mullowney    Collective on MPI_Comm
7169ae82921SPaul Mullowney 
7179ae82921SPaul Mullowney    Input Parameters:
7189ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
7199ae82921SPaul Mullowney .  m - number of rows
7209ae82921SPaul Mullowney .  n - number of columns
7219ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
7229ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7239ae82921SPaul Mullowney          (possibly different for each row) or PETSC_NULL
7249ae82921SPaul Mullowney 
7259ae82921SPaul Mullowney    Output Parameter:
7269ae82921SPaul Mullowney .  A - the matrix
7279ae82921SPaul Mullowney 
7289ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
7299ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
7309ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
7319ae82921SPaul Mullowney 
7329ae82921SPaul Mullowney    Notes:
7339ae82921SPaul Mullowney    If nnz is given then nz is ignored
7349ae82921SPaul Mullowney 
7359ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
7369ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
7379ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
7389ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
7399ae82921SPaul Mullowney 
7409ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
7419ae82921SPaul Mullowney    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
7429ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
7439ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
7449ae82921SPaul Mullowney 
7459ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
7469ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
7479ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
7489ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
7499ae82921SPaul Mullowney 
7509ae82921SPaul Mullowney    Level: intermediate
7519ae82921SPaul Mullowney 
752e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
7539ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an empty matrix, type it, then preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global dimensions coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocation: nnz (per-row counts) takes precedence over nz when given */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7659ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Releases all GPU-side storage attached to the
   matrix (either the SpMV matrix or the triangular factors, depending on
   A->factortype), tears down the shared cusparse handle, and delegates the
   remaining cleanup to MatDestroy_SeqAIJ.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpuMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* an ordinary (unfactored) matrix: free the GPU matrix and work vector */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) delete (GPU_Matrix_Ifc*)(gpuMat->mat);
      if (gpuMat->tempvec != 0) delete gpuMat->tempvec;
      delete gpuMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* a factored matrix: free both triangular factors and the work vector */
    try {
      Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      delete (GPU_Matrix_Ifc*)triFactors->loTriFactorPtr;
      delete (GPU_Matrix_Ifc*)triFactors->upTriFactorPtr;
      delete (CUSPARRAY*)triFactors->tempvec;
      delete triFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): this destroys the library-wide MAT_cusparseHandle even when
     other CUSPARSE matrices are still alive; a fresh handle is created on
     demand in MatCreate_SeqAIJCUSPARSE, but confirm no in-flight users exist */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle = 0;
  }
  /* MatDestroy would PetscFree spptr if nonzero, but it was allocated with
     C++ new (no PetscMalloc header), so it must be cleared here first */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8119ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor: builds on MatCreate_SeqAIJ,
   attaches the appropriate (still-empty) GPU container to A->spptr, makes
   sure the shared cusparse handle exists, and installs the CUSPARSE-specific
   function table entries.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype == MAT_FACTOR_NONE) {
    /* an ordinary matrix: attach an empty GPU matrix container.
       (the inode.use flag cannot be consulted here: the matrix was just created) */
    Mat_SeqAIJCUSPARSE *gpuMat = new Mat_SeqAIJCUSPARSE;
    gpuMat->mat     = 0;
    gpuMat->tempvec = 0;
    gpuMat->format  = MAT_CUSPARSE_CSR;
    B->spptr        = gpuMat;
  } else {
    /* a factor: attach an empty container for the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *triFactors = new Mat_SeqAIJCUSPARSETriFactors;
    triFactors->loTriFactorPtr = 0;
    triFactors->upTriFactorPtr = 0;
    triFactors->tempvec        = 0;
    triFactors->format         = cusparseMatSolveStorageFormat;
    B->spptr                   = triFactors;
  }
  /* one MAT_cusparseHandle is shared by every matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Overload MatGetFactor_petsc_C so -mat_type aijcusparse uses the default
     cusparse tri solve; note the difference with MatCreate_SeqAIJCUSP in
     ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
8599ae82921SPaul Mullowney 
860e057df02SPaul Mullowney /*M
861e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
862e057df02SPaul Mullowney 
863e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
864e057df02SPaul Mullowney    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
865e057df02SPaul Mullowney    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
866e057df02SPaul Mullowney    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
867e057df02SPaul Mullowney    the different GPU storage formats.
868e057df02SPaul Mullowney 
869e057df02SPaul Mullowney    Options Database Keys:
870e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
871*8468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
872*8468deeeSKarl Rupp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
873*8468deeeSKarl Rupp -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
874e057df02SPaul Mullowney 
875e057df02SPaul Mullowney   Level: beginner
876e057df02SPaul Mullowney 
877*8468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
878e057df02SPaul Mullowney M*/
879