xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4c87dfd424bd2c59d26c3686d7bb913984f46822)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney     Defines the basic matrix operations for the AIJ (compressed row)
39ae82921SPaul Mullowney   matrix storage format.
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN
89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
99ae82921SPaul Mullowney //#include "petscbt.h"
109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END
139ae82921SPaul Mullowney #undef VecType
149ae82921SPaul Mullowney #include "cusparsematimpl.h"
15e057df02SPaul Mullowney const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
169ae82921SPaul Mullowney 
17e057df02SPaul Mullowney /* this is such a hack ... but I don't know of another way to pass this variable
18e057df02SPaul Mullowney    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19e057df02SPaul Mullowney    SpMV. Essentially, I need to use the same stream variable in two different
20e057df02SPaul Mullowney    data structures. I do this by creating a single instance of that stream
21e057df02SPaul Mullowney    and reuse it. */
22ca45077fSPaul Mullowney cudaStream_t theBodyStream=0;
239ae82921SPaul Mullowney 
249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
299ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
318dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
328dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
338dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
348dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
359ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query routine composed onto factored matrices (under the name
   "MatFactorGetSolverPackage_C") so that callers can discover that this
   factorization was produced by the MATSOLVERCUSPARSE package.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
449ae82921SPaul Mullowney 
459ae82921SPaul Mullowney EXTERN_C_BEGIN
469ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
479ae82921SPaul Mullowney EXTERN_C_END
48e057df02SPaul Mullowney /*
499ae82921SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
509ae82921SPaul Mullowney   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
519ae82921SPaul Mullowney 
529ae82921SPaul Mullowney    Level: beginner
53e057df02SPaul Mullowney */
549ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine for the MATSOLVERCUSPARSE package.  Creates the factor
   matrix B by delegating to the plain seqaij factory, then converts it to
   MATSEQAIJCUSPARSE and installs the CUSPARSE symbolic-factorization
   entry points so that the numeric factorization/solve run on the GPU.

   Only LU-type factorizations (LU, ILU, ILUDT) are supported; any other
   factor type raises PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* build a standard seqaij factor matrix first, then specialize it */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* let MatFactorGetSolverPackage() report "cusparse" for this factor */
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
759ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat() for
   SEQAIJCUSPARSE matrices: records the requested GPU storage format for
   the SpMV path, the triangular-solve path, or both.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *gpuStruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    /* format used by the SpMV matrix only */
    gpuStruct->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    /* format picked up when the triangular factors are next built */
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    gpuStruct->format             = format;
    cusparseMatSolveStorageFormat = format;
  } else {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
1009ae82921SPaul Mullowney 
101e057df02SPaul Mullowney 
102e057df02SPaul Mullowney /*@
103e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
104e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
105e057df02SPaul Mullowney    for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
106e057df02SPaul Mullowney    to build/install PETSc to use this package.
107e057df02SPaul Mullowney 
108e057df02SPaul Mullowney    Not Collective
109e057df02SPaul Mullowney 
110e057df02SPaul Mullowney    Input Parameters:
111e057df02SPaul Mullowney +  A : Matrix of type SEQAIJCUSPARSE
112e057df02SPaul Mullowney .  op : MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
113e057df02SPaul Mullowney -  format : MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
114e057df02SPaul Mullowney 
115e057df02SPaul Mullowney    Output Parameter:
116e057df02SPaul Mullowney 
117e057df02SPaul Mullowney    Level: intermediate
118e057df02SPaul Mullowney 
119e057df02SPaul Mullowney .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
120e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
/*
   Public entry point (documented in the manual page above): dispatches to
   the matrix type's "MatCUSPARSESetFormat_C" method if one is composed.
   PetscTryMethod makes this a no-op for matrix types that do not
   implement it, so it is safe to call on any Mat.
*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
131e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_cusparse_*_storage_format options that select the GPU
   storage format (CSR/ELL/HYB) used for SpMV (unfactored matrices) or
   triangular solves (factored matrices), plus -mat_cusparse_storage_format
   which sets both at once.

   Fixes versus the previous revision:
   - `format` was passed to PetscOptionsEnum() as the current/default value
     while still uninitialized; it is now initialized to MAT_CUSPARSE_CSR.
   - PetscObjectOptionsBegin()'s return code was dropped; it is now checked.
   - PetscOptionsHead() was called before the options block was opened; it
     is now called inside the Begin/End pair.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format = MAT_CUSPARSE_CSR; /* default shown to the user */
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrices: format only matters for SpMV */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrices: format only matters for the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* umbrella option applying to both SpMV and TriSolve */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1659ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: the symbolic phase runs entirely on the CPU
   via the seqaij implementation; only the numeric phase is redirected so
   that the resulting factors are copied to the GPU afterwards.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1779ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: identical structure to the ILU variant above —
   CPU symbolic phase via seqaij, numeric phase redirected to the CUSPARSE
   version.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1899ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Builds the lower-triangular factor L on the GPU from the host-side
   factored AIJ matrix A, including explicit unit-diagonal entries (the
   host storage visible here does not carry the 1.0 diagonal of L; it is
   inserted per row below).  The CSR arrays are staged in pinned host
   memory (cudaMallocHost) so the upload is fast, then handed to a
   GPU_Matrix_Ifc instance and the cusparse triangular-solve analysis is
   run.  Only executes when the GPU copy is absent or stale.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n diagonal entries + strictly-lower entries of rows 1..n-1 (row 0 has only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for fast H2D copy) */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;      /* write position in AjLo/AALo */
      rowOffset=1;   /* running row-pointer value */
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the configured storage format and run the solve analysis */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* staging buffers are copied to the device by setMatrix; safe to free now */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
2579ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Builds the upper-triangular factor U on the GPU.  The host factored AIJ
   matrix stores U via a->diag with rows addressed backwards (adiag[i]
   decreasing), so the rows are walked from n-1 down to 0 while the CSR
   output is filled back-to-front.  The diagonal is stored inverted on the
   host (solve uses multiplication), hence the 1./v[nz] below.  As with the
   lower factor, arrays are staged in pinned host memory and the cusparse
   solve analysis is run once at build time.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, last row first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
        /* host storage for row i of U starts just past adiag[i+1] */
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted on the host) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of the row */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the configured storage format and run the solve analysis */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3229ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Copies both triangular factors of A to the GPU, allocates the shared
   temporary vector used between the two solves, and (for non-natural
   orderings) uploads the row/column permutation indices to the factors.

   Fixes versus the previous revision:
   - __FUNCT__ was misspelled ("...AnalysiAndCopyToGPU"), so error
     tracebacks reported a nonexistent function name.
   - The two brace-less `if (!identity)` statements guarded only the
     `ierr = ...;` assignment; CHKERRCUSP(ierr) ran unconditionally on a
     stale ierr.  Braces now scope the check with the call it checks.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* work vector shared by the lower and upper triangular solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: only needed when the row ordering is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: only needed when the column ordering is not the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3579ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: the factorization itself runs on the CPU
   via the seqaij routine; afterwards the solve callback is chosen (fast
   path when both orderings are the identity) and the factors are copied
   to the GPU with their cusparse solve analysis.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3799ae82921SPaul Mullowney 
3809ae82921SPaul Mullowney 
3819ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve x = U^{-1} L^{-1} b for factors with a non-natural
   ordering: the right-hand side is permuted into x on the device
   (reorderIn), the two triangular solves run through the shared temporary
   vector, and the result is permuted back (reorderOut).  Synchronizes
   with the GPU before returning so the flop log and result are complete.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute b into x, L-solve, U-solve, permute back */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4129ae82921SPaul Mullowney 
4139ae82921SPaul Mullowney 
4149ae82921SPaul Mullowney 
4159ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Fast-path GPU triangular solve used when both factor orderings are the
   identity: same two solves as MatSolve_SeqAIJCUSPARSE but with no
   permutation steps (b is consumed and x produced in place).
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: L-solve into the temporary, then U-solve into x */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4449ae82921SPaul Mullowney 
4459ae82921SPaul Mullowney #undef __FUNCT__
446e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
447e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
4489ae82921SPaul Mullowney {
4499ae82921SPaul Mullowney 
4509ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
4519ae82921SPaul Mullowney   Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
4529ae82921SPaul Mullowney   PetscInt        m           = A->rmap->n,*ii,*ridx;
4539ae82921SPaul Mullowney   PetscErrorCode  ierr;
4549ae82921SPaul Mullowney 
4559ae82921SPaul Mullowney 
4569ae82921SPaul Mullowney   PetscFunctionBegin;
4579ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
4589ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
4599ae82921SPaul Mullowney     /*
4609ae82921SPaul Mullowney       It may be possible to reuse nonzero structure with new matrix values but
4619ae82921SPaul Mullowney       for simplicity and insured correctness we delete and build a new matrix on
4629ae82921SPaul Mullowney       the GPU. Likely a very small performance hit.
4639ae82921SPaul Mullowney     */
4649ae82921SPaul Mullowney     if (cusparseMat->mat){
4659ae82921SPaul Mullowney       try {
4669ae82921SPaul Mullowney 	delete cusparseMat->mat;
4679ae82921SPaul Mullowney 	if (cusparseMat->tempvec)
4689ae82921SPaul Mullowney 	  delete cusparseMat->tempvec;
4699ae82921SPaul Mullowney 
4709ae82921SPaul Mullowney       } catch(char* ex) {
4719ae82921SPaul Mullowney 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4729ae82921SPaul Mullowney       }
4739ae82921SPaul Mullowney     }
4749ae82921SPaul Mullowney     try {
4759ae82921SPaul Mullowney       cusparseMat->nonzerorow=0;
4769ae82921SPaul Mullowney       for (int j = 0; j<m; j++)
4779ae82921SPaul Mullowney 	cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);
4789ae82921SPaul Mullowney 
4799ae82921SPaul Mullowney       if (a->compressedrow.use) {
4809ae82921SPaul Mullowney 	m    = a->compressedrow.nrows;
4819ae82921SPaul Mullowney 	ii   = a->compressedrow.i;
4829ae82921SPaul Mullowney 	ridx = a->compressedrow.rindex;
4839ae82921SPaul Mullowney       } else {
484e057df02SPaul Mullowney 	/* Forcing compressed row on the GPU ... only relevant for CSR storage */
4859ae82921SPaul Mullowney 	int k=0;
4869ae82921SPaul Mullowney 	ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
4879ae82921SPaul Mullowney 	ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
4889ae82921SPaul Mullowney 	ii[0]=0;
4899ae82921SPaul Mullowney 	for (int j = 0; j<m; j++) {
4909ae82921SPaul Mullowney 	  if ((a->i[j+1]-a->i[j])>0) {
4919ae82921SPaul Mullowney 	    ii[k] = a->i[j];
4929ae82921SPaul Mullowney 	    ridx[k]= j;
4939ae82921SPaul Mullowney 	    k++;
4949ae82921SPaul Mullowney 	  }
4959ae82921SPaul Mullowney 	}
4969ae82921SPaul Mullowney 	ii[cusparseMat->nonzerorow] = a->nz;
4979ae82921SPaul Mullowney 	m = cusparseMat->nonzerorow;
4989ae82921SPaul Mullowney       }
4999ae82921SPaul Mullowney 
500e057df02SPaul Mullowney       /* Build our matrix ... first determine the GPU storage type */
501e057df02SPaul Mullowney       cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);
5029ae82921SPaul Mullowney 
503e057df02SPaul Mullowney       /* Create the streams and events (if desired).  */
5049ae82921SPaul Mullowney       PetscMPIInt    size;
5059ae82921SPaul Mullowney       ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
5069ae82921SPaul Mullowney       ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);
5079ae82921SPaul Mullowney 
508e057df02SPaul Mullowney       /* FILL MODE UPPER is irrelevant */
509ca45077fSPaul Mullowney       cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
510ca45077fSPaul Mullowney 
511e057df02SPaul Mullowney       /* lastly, build the matrix */
5129ae82921SPaul Mullowney       ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
5139ae82921SPaul Mullowney       cusparseMat->mat->setCPRowIndices(ridx, m);
5149ae82921SPaul Mullowney       if (!a->compressedrow.use) {
5159ae82921SPaul Mullowney 	ierr = PetscFree(ii);CHKERRQ(ierr);
5169ae82921SPaul Mullowney 	ierr = PetscFree(ridx);CHKERRQ(ierr);
5179ae82921SPaul Mullowney       }
5189ae82921SPaul Mullowney       cusparseMat->tempvec = new CUSPARRAY;
5199ae82921SPaul Mullowney       cusparseMat->tempvec->resize(m);
5209ae82921SPaul Mullowney     } catch(char* ex) {
5219ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5229ae82921SPaul Mullowney     }
5239ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
5249ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
5259ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
5269ae82921SPaul Mullowney   }
5279ae82921SPaul Mullowney   PetscFunctionReturn(0);
5289ae82921SPaul Mullowney }
5299ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Create GPU-resident (VECSEQCUSP) work vectors compatible with this matrix:
   *right is compatible with the column space (an input of y = A*x) and *left
   with the row space (the output).  Either output pointer may be PETSC_NULL,
   in which case that vector is not created.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector lives in the COLUMN space, so take the block size from
       the column layout; the previous code used mat->rmap->bs, which is only
       correct when the row and column block sizes happen to agree */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix column layout rather than rebuilding it */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix row layout */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
5549ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x on the GPU using the CUSPARSE matrix object stored in A->spptr.
   Assumes the GPU copy of A is already current (kept so by
   MatAssemblyEnd_SeqAIJCUSPARSE). */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    /* the GPU matrix classes signal failure by throwing char* messages */
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* skip the blocking sync when the SpMV was queued on a nonzero stream —
     NOTE(review): presumably the stream owner synchronizes later; confirm */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one add per nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5829ae82921SPaul Mullowney 
5839ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T * x (A^H * x for complex scalars) on the GPU.
   Assumes the GPU copy of A is already current (kept so by
   MatAssemblyEnd_SeqAIJCUSPARSE). */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    /* for complex scalars MatMultTranspose is the Hermitian (conjugate) transpose */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* skip the blocking sync when work was queued on a nonzero stream —
     NOTE(review): presumably the stream owner synchronizes later; confirm */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one add per nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
615ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x on the GPU: yy is first copied into zz, then the SpMV
   accumulates A*x into zz in place. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed the result with y so the multiplyAdd below accumulates on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired but never used below — possibly kept
       only for the get/restore side effects; confirm it can be removed */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
6479ae82921SPaul Mullowney 
#undef __FUNCT__
/* fixed: __FUNCT__ previously said "MatMultAdd_SeqAIJCUSPARSE", so error
   tracebacks reported the wrong routine */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T * x (A^H * x for complex scalars) on the GPU: yy is first
   copied into zz, then the transpose SpMV accumulates into zz in place. */
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed the result with y so the multiplyAdd below accumulates on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose; accumulate into the OUTPUT array
       zarray (the previous code accumulated into yarray, clobbering the
       read-only input yy and leaving zz holding only the copy of yy —
       compare MatMultAdd_SeqAIJCUSPARSE, which writes into zarray) */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
683ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish assembly on the host, mirror the matrix onto the GPU, and — for a
   final (non-flush) assembly — install the GPU multiply kernels. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* push the (possibly updated) host data to the GPU; this runs for flush
     assemblies as well */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
6999ae82921SPaul Mullowney 
7009ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
7019ae82921SPaul Mullowney #undef __FUNCT__
7029ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
703e057df02SPaul Mullowney /*@
7049ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
706e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
707e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
708e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
709e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7109ae82921SPaul Mullowney 
7119ae82921SPaul Mullowney    Collective on MPI_Comm
7129ae82921SPaul Mullowney 
7139ae82921SPaul Mullowney    Input Parameters:
7149ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
7159ae82921SPaul Mullowney .  m - number of rows
7169ae82921SPaul Mullowney .  n - number of columns
7179ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
7189ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7199ae82921SPaul Mullowney          (possibly different for each row) or PETSC_NULL
7209ae82921SPaul Mullowney 
7219ae82921SPaul Mullowney    Output Parameter:
7229ae82921SPaul Mullowney .  A - the matrix
7239ae82921SPaul Mullowney 
7249ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
7269ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
7279ae82921SPaul Mullowney 
7289ae82921SPaul Mullowney    Notes:
7299ae82921SPaul Mullowney    If nnz is given then nz is ignored
7309ae82921SPaul Mullowney 
7319ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
7329ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
7339ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
7349ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
7359ae82921SPaul Mullowney 
7369ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
7379ae82921SPaul Mullowney    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
7389ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
7399ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
7409ae82921SPaul Mullowney 
7419ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
7429ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
7439ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
7449ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
7459ae82921SPaul Mullowney 
7469ae82921SPaul Mullowney    Level: intermediate
7479ae82921SPaul Mullowney 
748e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
7499ae82921SPaul Mullowney @*/
/* Convenience constructor documented in the manual-page comment above:
   create, size, type, and preallocate a MATSEQAIJCUSPARSE matrix in one call. */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast discards const to match the _SeqAIJ preallocation signature;
     the array is only read */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7619ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Release the GPU-side data hanging off A->spptr, then fall through to the
   ordinary SeqAIJ destructor.  A->spptr holds a Mat_SeqAIJCUSPARSE for a
   plain matrix and a Mat_SeqAIJCUSPARSETriFactors for a factored one. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* the GPU matrix only exists once something has been copied up */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a global shared by ALL matrices of
     this type, yet it is destroyed whenever ANY one of them is destroyed;
     other still-live matrices would be left with a dangling handle — verify
     behavior with multiple simultaneous aijcusparse matrices */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8079ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor registered for MATSEQAIJCUSPARSE: builds the base SeqAIJ
   matrix, attaches the GPU-side data structure in B->spptr (a different
   struct for factored vs. unfactored matrices), lazily creates the shared
   cusparse handle, and installs the GPU operation table. */
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr        = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = MAT_CUSPARSE_CSR;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr        = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = cusparseMatSolveStorageFormat;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
8559ae82921SPaul Mullowney 
856e057df02SPaul Mullowney /*M
857e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
858e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu when configuring PETSc to enable the CUSPARSE library and
   the different GPU storage formats.
864e057df02SPaul Mullowney 
865e057df02SPaul Mullowney    Options Database Keys:
866e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
867e057df02SPaul Mullowney .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
868e057df02SPaul Mullowney .  -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
869e057df02SPaul Mullowney -  -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
870e057df02SPaul Mullowney 
871e057df02SPaul Mullowney   Level: beginner
872e057df02SPaul Mullowney 
873e057df02SPaul Mullowney .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
874e057df02SPaul Mullowney M*/
875