xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4e4bbfaa3814cc83b5851d85be69070845f5653e)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15bc3f50f2SPaul Mullowney 
16e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
179ae82921SPaul Mullowney 
18087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
21087f3262SPaul Mullowney 
226fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
236fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
246fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
25087f3262SPaul Mullowney 
266fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
304416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
316fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
326fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
336fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
346fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
359ae82921SPaul Mullowney 
367f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
37470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
38470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
39470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
40470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
417f756511SDominic Meiser 
42b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
43b06137fdSPaul Mullowney {
44b06137fdSPaul Mullowney   cusparseStatus_t   stat;
45b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
46b06137fdSPaul Mullowney 
47b06137fdSPaul Mullowney   PetscFunctionBegin;
48b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
49c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
50b06137fdSPaul Mullowney   PetscFunctionReturn(0);
51b06137fdSPaul Mullowney }
52b06137fdSPaul Mullowney 
53b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
54b06137fdSPaul Mullowney {
55b06137fdSPaul Mullowney   cusparseStatus_t   stat;
56b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
57b06137fdSPaul Mullowney 
58b06137fdSPaul Mullowney   PetscFunctionBegin;
596b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
6016a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
61c41cb2e2SAlejandro Lamas Daviña       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
6216a2e217SAlejandro Lamas Daviña     }
63b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
646b1cf21dSAlejandro Lamas Daviña   }
65c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
66b06137fdSPaul Mullowney   PetscFunctionReturn(0);
67b06137fdSPaul Mullowney }
68b06137fdSPaul Mullowney 
69b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
70b06137fdSPaul Mullowney {
71b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
72b06137fdSPaul Mullowney   PetscFunctionBegin;
73b06137fdSPaul Mullowney   if (cusparsestruct->handle)
74b06137fdSPaul Mullowney     cusparsestruct->handle = 0;
75b06137fdSPaul Mullowney   PetscFunctionReturn(0);
76b06137fdSPaul Mullowney }
77b06137fdSPaul Mullowney 
78ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
799ae82921SPaul Mullowney {
809ae82921SPaul Mullowney   PetscFunctionBegin;
819ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
829ae82921SPaul Mullowney   PetscFunctionReturn(0);
839ae82921SPaul Mullowney }
849ae82921SPaul Mullowney 
85c708e6cdSJed Brown /*MC
86087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
87087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
88087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
89087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
90087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
91087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
92c708e6cdSJed Brown 
939ae82921SPaul Mullowney   Level: beginner
94c708e6cdSJed Brown 
953ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
96c708e6cdSJed Brown M*/
979ae82921SPaul Mullowney 
9842c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
999ae82921SPaul Mullowney {
1009ae82921SPaul Mullowney   PetscErrorCode ierr;
101bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1029ae82921SPaul Mullowney 
1039ae82921SPaul Mullowney   PetscFunctionBegin;
104bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
105404133a2SPaul Mullowney   (*B)->factortype = ftype;
106bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1079ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1082205254eSKarl Rupp 
109087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
11033d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1119ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1129ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
113087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
114087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
115087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1169ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
117bc3f50f2SPaul Mullowney 
118fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1193ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1209ae82921SPaul Mullowney   PetscFunctionReturn(0);
1219ae82921SPaul Mullowney }
1229ae82921SPaul Mullowney 
123bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
124ca45077fSPaul Mullowney {
125aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1266e111a19SKarl Rupp 
127ca45077fSPaul Mullowney   PetscFunctionBegin;
128ca45077fSPaul Mullowney   switch (op) {
129e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
130aa372e3fSPaul Mullowney     cusparsestruct->format = format;
131ca45077fSPaul Mullowney     break;
132e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
133aa372e3fSPaul Mullowney     cusparsestruct->format = format;
134ca45077fSPaul Mullowney     break;
135ca45077fSPaul Mullowney   default:
13636d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137ca45077fSPaul Mullowney   }
138ca45077fSPaul Mullowney   PetscFunctionReturn(0);
139ca45077fSPaul Mullowney }
1409ae82921SPaul Mullowney 
141e057df02SPaul Mullowney /*@
142e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
143e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
144aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
145e057df02SPaul Mullowney    Not Collective
146e057df02SPaul Mullowney 
147e057df02SPaul Mullowney    Input Parameters:
1488468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
14936d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1502692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
151e057df02SPaul Mullowney 
152e057df02SPaul Mullowney    Output Parameter:
153e057df02SPaul Mullowney 
154e057df02SPaul Mullowney    Level: intermediate
155e057df02SPaul Mullowney 
1568468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
157e057df02SPaul Mullowney @*/
158e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
159e057df02SPaul Mullowney {
160e057df02SPaul Mullowney   PetscErrorCode ierr;
1616e111a19SKarl Rupp 
162e057df02SPaul Mullowney   PetscFunctionBegin;
163e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
164e057df02SPaul Mullowney   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
165e057df02SPaul Mullowney   PetscFunctionReturn(0);
166e057df02SPaul Mullowney }
167e057df02SPaul Mullowney 
1684416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
1699ae82921SPaul Mullowney {
1709ae82921SPaul Mullowney   PetscErrorCode           ierr;
171e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
1729ae82921SPaul Mullowney   PetscBool                flg;
173a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1746e111a19SKarl Rupp 
1759ae82921SPaul Mullowney   PetscFunctionBegin;
176e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
1779ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
178e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
179a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
180e057df02SPaul Mullowney     if (flg) {
181e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
182045c96e1SPaul Mullowney     }
1839ae82921SPaul Mullowney   }
1844c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
185a183c035SDominic Meiser                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
1864c87dfd4SPaul Mullowney   if (flg) {
1874c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
1884c87dfd4SPaul Mullowney   }
1890af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
1909ae82921SPaul Mullowney   PetscFunctionReturn(0);
1919ae82921SPaul Mullowney 
1929ae82921SPaul Mullowney }
1939ae82921SPaul Mullowney 
1946fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1959ae82921SPaul Mullowney {
1969ae82921SPaul Mullowney   PetscErrorCode ierr;
1979ae82921SPaul Mullowney 
1989ae82921SPaul Mullowney   PetscFunctionBegin;
1999ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2009ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2019ae82921SPaul Mullowney   PetscFunctionReturn(0);
2029ae82921SPaul Mullowney }
2039ae82921SPaul Mullowney 
2046fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2059ae82921SPaul Mullowney {
2069ae82921SPaul Mullowney   PetscErrorCode ierr;
2079ae82921SPaul Mullowney 
2089ae82921SPaul Mullowney   PetscFunctionBegin;
2099ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2109ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2119ae82921SPaul Mullowney   PetscFunctionReturn(0);
2129ae82921SPaul Mullowney }
2139ae82921SPaul Mullowney 
214087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
215087f3262SPaul Mullowney {
216087f3262SPaul Mullowney   PetscErrorCode ierr;
217087f3262SPaul Mullowney 
218087f3262SPaul Mullowney   PetscFunctionBegin;
219087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
220087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
221087f3262SPaul Mullowney   PetscFunctionReturn(0);
222087f3262SPaul Mullowney }
223087f3262SPaul Mullowney 
224087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
225087f3262SPaul Mullowney {
226087f3262SPaul Mullowney   PetscErrorCode ierr;
227087f3262SPaul Mullowney 
228087f3262SPaul Mullowney   PetscFunctionBegin;
229087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
230087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
231087f3262SPaul Mullowney   PetscFunctionReturn(0);
232087f3262SPaul Mullowney }
233087f3262SPaul Mullowney 
234087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2359ae82921SPaul Mullowney {
2369ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2379ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2389ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
239aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2409ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2419ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2429ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2439ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2449ae82921SPaul Mullowney   PetscScalar                       *AALo;
2459ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
246b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
2479ae82921SPaul Mullowney 
2489ae82921SPaul Mullowney   PetscFunctionBegin;
249cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
250b8ced49eSKarl Rupp   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
2519ae82921SPaul Mullowney     try {
2529ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2539ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2549ae82921SPaul Mullowney 
2559ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
256c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
257c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
258c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
2599ae82921SPaul Mullowney 
2609ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
2619ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
2629ae82921SPaul Mullowney       AiLo[n]  = nzLower;
2639ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
2649ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
2659ae82921SPaul Mullowney       v        = aa;
2669ae82921SPaul Mullowney       vi       = aj;
2679ae82921SPaul Mullowney       offset   = 1;
2689ae82921SPaul Mullowney       rowOffset= 1;
2699ae82921SPaul Mullowney       for (i=1; i<n; i++) {
2709ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
271e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
2729ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
2739ae82921SPaul Mullowney         rowOffset += nz+1;
2749ae82921SPaul Mullowney 
275580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
276580bdb30SBarry Smith         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
2779ae82921SPaul Mullowney 
2789ae82921SPaul Mullowney         offset      += nz;
2799ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
2809ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
2819ae82921SPaul Mullowney         offset      += 1;
2829ae82921SPaul Mullowney 
2839ae82921SPaul Mullowney         v  += nz;
2849ae82921SPaul Mullowney         vi += nz;
2859ae82921SPaul Mullowney       }
2862205254eSKarl Rupp 
287aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
288aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
2892205254eSKarl Rupp 
290aa372e3fSPaul Mullowney       /* Create the matrix description */
291c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
292c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
293c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
294c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
295c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
296aa372e3fSPaul Mullowney 
297aa372e3fSPaul Mullowney       /* Create the solve analysis information */
298c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
299aa372e3fSPaul Mullowney 
300aa372e3fSPaul Mullowney       /* set the operation */
301aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
302aa372e3fSPaul Mullowney 
303aa372e3fSPaul Mullowney       /* set the matrix */
304aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
305aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
306aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
307aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
308aa372e3fSPaul Mullowney 
309aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
310aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
311aa372e3fSPaul Mullowney 
312aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
313aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
314aa372e3fSPaul Mullowney 
315aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
316aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
317aa372e3fSPaul Mullowney 
318aa372e3fSPaul Mullowney       /* perform the solve analysis */
319aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
320aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
321aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
322c41cb2e2SAlejandro Lamas Daviña                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
323aa372e3fSPaul Mullowney 
324aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
325aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3262205254eSKarl Rupp 
327c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
328c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
329c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
3304863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
3319ae82921SPaul Mullowney     } catch(char *ex) {
3329ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3339ae82921SPaul Mullowney     }
3349ae82921SPaul Mullowney   }
3359ae82921SPaul Mullowney   PetscFunctionReturn(0);
3369ae82921SPaul Mullowney }
3379ae82921SPaul Mullowney 
338087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3399ae82921SPaul Mullowney {
3409ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3419ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3429ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
343aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3449ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3459ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3469ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3479ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3489ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3499ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3509ae82921SPaul Mullowney   PetscErrorCode                    ierr;
3519ae82921SPaul Mullowney 
3529ae82921SPaul Mullowney   PetscFunctionBegin;
353cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
354b8ced49eSKarl Rupp   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
3559ae82921SPaul Mullowney     try {
3569ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3579ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
3589ae82921SPaul Mullowney 
3599ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
360c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
361c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
362c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
3639ae82921SPaul Mullowney 
3649ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
3659ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
3669ae82921SPaul Mullowney       AiUp[n]=nzUpper;
3679ae82921SPaul Mullowney       offset = nzUpper;
3689ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
3699ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
3709ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
3719ae82921SPaul Mullowney 
372e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
3739ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
3749ae82921SPaul Mullowney 
375e057df02SPaul Mullowney         /* decrement the offset */
3769ae82921SPaul Mullowney         offset -= (nz+1);
3779ae82921SPaul Mullowney 
378e057df02SPaul Mullowney         /* first, set the diagonal elements */
3799ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
38009f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1./v[nz];
3819ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
3829ae82921SPaul Mullowney 
383580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
384580bdb30SBarry Smith         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
3859ae82921SPaul Mullowney       }
3862205254eSKarl Rupp 
387aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
388aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3892205254eSKarl Rupp 
390aa372e3fSPaul Mullowney       /* Create the matrix description */
391c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
392c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
393c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
394c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
395c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
396aa372e3fSPaul Mullowney 
397aa372e3fSPaul Mullowney       /* Create the solve analysis information */
398c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
399aa372e3fSPaul Mullowney 
400aa372e3fSPaul Mullowney       /* set the operation */
401aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
402aa372e3fSPaul Mullowney 
403aa372e3fSPaul Mullowney       /* set the matrix */
404aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
405aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
406aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
407aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
408aa372e3fSPaul Mullowney 
409aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
410aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
411aa372e3fSPaul Mullowney 
412aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
413aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
414aa372e3fSPaul Mullowney 
415aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
416aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
417aa372e3fSPaul Mullowney 
418aa372e3fSPaul Mullowney       /* perform the solve analysis */
419aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
420aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
421aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
422c41cb2e2SAlejandro Lamas Daviña                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
423aa372e3fSPaul Mullowney 
424aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
425aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4262205254eSKarl Rupp 
427c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
428c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
429c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
4304863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
4319ae82921SPaul Mullowney     } catch(char *ex) {
4329ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4339ae82921SPaul Mullowney     }
4349ae82921SPaul Mullowney   }
4359ae82921SPaul Mullowney   PetscFunctionReturn(0);
4369ae82921SPaul Mullowney }
4379ae82921SPaul Mullowney 
438087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4399ae82921SPaul Mullowney {
4409ae82921SPaul Mullowney   PetscErrorCode               ierr;
4419ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4429ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4439ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4449ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4459ae82921SPaul Mullowney   const PetscInt               *r,*c;
4469ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4479ae82921SPaul Mullowney 
4489ae82921SPaul Mullowney   PetscFunctionBegin;
449087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
450087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4512205254eSKarl Rupp 
452e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
453aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
4549ae82921SPaul Mullowney 
455b8ced49eSKarl Rupp   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
456e057df02SPaul Mullowney   /* lower triangular indices */
4579ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4589ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4592205254eSKarl Rupp   if (!row_identity) {
460aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
461aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
4622205254eSKarl Rupp   }
4639ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4649ae82921SPaul Mullowney 
465e057df02SPaul Mullowney   /* upper triangular indices */
4669ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
4679ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4682205254eSKarl Rupp   if (!col_identity) {
469aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
470aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
4712205254eSKarl Rupp   }
4724863603aSSatish Balay 
4734863603aSSatish Balay   if (!row_identity && !col_identity) {
4744863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
4754863603aSSatish Balay   } else if(!row_identity) {
4764863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4774863603aSSatish Balay   } else if(!col_identity) {
4784863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4794863603aSSatish Balay   }
4804863603aSSatish Balay 
4819ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
4829ae82921SPaul Mullowney   PetscFunctionReturn(0);
4839ae82921SPaul Mullowney }
4849ae82921SPaul Mullowney 
485087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
486087f3262SPaul Mullowney {
487087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
488087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
489aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
490aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
491087f3262SPaul Mullowney   cusparseStatus_t                  stat;
492087f3262SPaul Mullowney   PetscErrorCode                    ierr;
493087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
494087f3262SPaul Mullowney   PetscScalar                       *AAUp;
495087f3262SPaul Mullowney   PetscScalar                       *AALo;
496087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
497087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
498087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
499087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
500087f3262SPaul Mullowney 
501087f3262SPaul Mullowney   PetscFunctionBegin;
502cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
503b8ced49eSKarl Rupp   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
504087f3262SPaul Mullowney     try {
505087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
506c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
507c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
508c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
509c41cb2e2SAlejandro Lamas Daviña       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510087f3262SPaul Mullowney 
511087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
512087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
513087f3262SPaul Mullowney       AiUp[n]=nzUpper;
514087f3262SPaul Mullowney       offset = 0;
515087f3262SPaul Mullowney       for (i=0; i<n; i++) {
516087f3262SPaul Mullowney         /* set the pointers */
517087f3262SPaul Mullowney         v  = aa + ai[i];
518087f3262SPaul Mullowney         vj = aj + ai[i];
519087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
520087f3262SPaul Mullowney 
521087f3262SPaul Mullowney         /* first, set the diagonal elements */
522087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
52309f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1.0/v[nz];
524087f3262SPaul Mullowney         AiUp[i]      = offset;
52509f51544SAlejandro Lamas Daviña         AALo[offset] = (MatScalar)1.0/v[nz];
526087f3262SPaul Mullowney 
527087f3262SPaul Mullowney         offset+=1;
528087f3262SPaul Mullowney         if (nz>0) {
529f22e0265SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
530580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
531087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
532087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
533087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
534087f3262SPaul Mullowney           }
535087f3262SPaul Mullowney           offset+=nz;
536087f3262SPaul Mullowney         }
537087f3262SPaul Mullowney       }
538087f3262SPaul Mullowney 
539aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
540aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
541087f3262SPaul Mullowney 
542aa372e3fSPaul Mullowney       /* Create the matrix description */
543c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
544c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
545c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
546c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
547c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
548087f3262SPaul Mullowney 
549aa372e3fSPaul Mullowney       /* Create the solve analysis information */
550c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
551aa372e3fSPaul Mullowney 
552aa372e3fSPaul Mullowney       /* set the operation */
553aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
554aa372e3fSPaul Mullowney 
555aa372e3fSPaul Mullowney       /* set the matrix */
556aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
557aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
558aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
559aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
560aa372e3fSPaul Mullowney 
561aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
562aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
563aa372e3fSPaul Mullowney 
564aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
565aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
566aa372e3fSPaul Mullowney 
567aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
568aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
569aa372e3fSPaul Mullowney 
570aa372e3fSPaul Mullowney       /* perform the solve analysis */
571aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
572aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
573aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
574c41cb2e2SAlejandro Lamas Daviña                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
575aa372e3fSPaul Mullowney 
576aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
577aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
578aa372e3fSPaul Mullowney 
579aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
580aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
581aa372e3fSPaul Mullowney 
582aa372e3fSPaul Mullowney       /* Create the matrix description */
583c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
584c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
585c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
586c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
587c41cb2e2SAlejandro Lamas Daviña       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
588aa372e3fSPaul Mullowney 
589aa372e3fSPaul Mullowney       /* Create the solve analysis information */
590c41cb2e2SAlejandro Lamas Daviña       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
591aa372e3fSPaul Mullowney 
592aa372e3fSPaul Mullowney       /* set the operation */
593aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
594aa372e3fSPaul Mullowney 
595aa372e3fSPaul Mullowney       /* set the matrix */
596aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
597aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
598aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
599aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
600aa372e3fSPaul Mullowney 
601aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
602aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
603aa372e3fSPaul Mullowney 
604aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
605aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
606aa372e3fSPaul Mullowney 
607aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
608aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
6094863603aSSatish Balay       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
610aa372e3fSPaul Mullowney 
611aa372e3fSPaul Mullowney       /* perform the solve analysis */
612aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
613aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
614aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
615c41cb2e2SAlejandro Lamas Daviña                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
616aa372e3fSPaul Mullowney 
617aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
618aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
619087f3262SPaul Mullowney 
620b8ced49eSKarl Rupp       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
621c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
622c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
623c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
624c41cb2e2SAlejandro Lamas Daviña       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
625087f3262SPaul Mullowney     } catch(char *ex) {
626087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
627087f3262SPaul Mullowney     }
628087f3262SPaul Mullowney   }
629087f3262SPaul Mullowney   PetscFunctionReturn(0);
630087f3262SPaul Mullowney }
631087f3262SPaul Mullowney 
632087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6339ae82921SPaul Mullowney {
6349ae82921SPaul Mullowney   PetscErrorCode               ierr;
635087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
636087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
637087f3262SPaul Mullowney   IS                           ip = a->row;
638087f3262SPaul Mullowney   const PetscInt               *rip;
639087f3262SPaul Mullowney   PetscBool                    perm_identity;
640087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
641087f3262SPaul Mullowney 
642087f3262SPaul Mullowney   PetscFunctionBegin;
643087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
644e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
645aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
646aa372e3fSPaul Mullowney 
647087f3262SPaul Mullowney   /* lower triangular indices */
648087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
649087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
650087f3262SPaul Mullowney   if (!perm_identity) {
651*4e4bbfaaSStefano Zampini     IS             iip;
652*4e4bbfaaSStefano Zampini     const PetscInt *irip;
653*4e4bbfaaSStefano Zampini 
654*4e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
655*4e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
656aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
657aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
658aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
659*4e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
660*4e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
661*4e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
6624863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
663087f3262SPaul Mullowney   }
664087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
665087f3262SPaul Mullowney   PetscFunctionReturn(0);
666087f3262SPaul Mullowney }
667087f3262SPaul Mullowney 
6686fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
6699ae82921SPaul Mullowney {
6709ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
6719ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
6729ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
673b175d8bbSPaul Mullowney   PetscErrorCode ierr;
6749ae82921SPaul Mullowney 
6759ae82921SPaul Mullowney   PetscFunctionBegin;
6769ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
677e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
6789ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6799ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
680bda325fcSPaul Mullowney   if (row_identity && col_identity) {
681bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
682bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
683*4e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
684*4e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
685bda325fcSPaul Mullowney   } else {
686bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
687bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
688*4e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
689*4e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
690bda325fcSPaul Mullowney   }
6918dc1d2a3SPaul Mullowney 
692e057df02SPaul Mullowney   /* get the triangular factors */
693087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
6949ae82921SPaul Mullowney   PetscFunctionReturn(0);
6959ae82921SPaul Mullowney }
6969ae82921SPaul Mullowney 
697087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
698087f3262SPaul Mullowney {
699087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
700087f3262SPaul Mullowney   IS             ip = b->row;
701087f3262SPaul Mullowney   PetscBool      perm_identity;
702b175d8bbSPaul Mullowney   PetscErrorCode ierr;
703087f3262SPaul Mullowney 
704087f3262SPaul Mullowney   PetscFunctionBegin;
705087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
706087f3262SPaul Mullowney 
707087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
708087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
709087f3262SPaul Mullowney   if (perm_identity) {
710087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
711087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
712*4e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
713*4e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
714087f3262SPaul Mullowney   } else {
715087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
716087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
717*4e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
718*4e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
719087f3262SPaul Mullowney   }
720087f3262SPaul Mullowney 
721087f3262SPaul Mullowney   /* get the triangular factors */
722087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
723087f3262SPaul Mullowney   PetscFunctionReturn(0);
724087f3262SPaul Mullowney }
7259ae82921SPaul Mullowney 
726b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
727bda325fcSPaul Mullowney {
728bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
729aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
730aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
731aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
732aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
733bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
734aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
735aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
736aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
737aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
738b175d8bbSPaul Mullowney 
739bda325fcSPaul Mullowney   PetscFunctionBegin;
740bda325fcSPaul Mullowney 
741aa372e3fSPaul Mullowney   /*********************************************/
742aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
743aa372e3fSPaul Mullowney   /*********************************************/
744aa372e3fSPaul Mullowney 
745aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
746aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
747aa372e3fSPaul Mullowney 
748aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
749aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
750aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
751aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
752aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
753aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
754aa372e3fSPaul Mullowney 
755aa372e3fSPaul Mullowney   /* Create the matrix description */
756c41cb2e2SAlejandro Lamas Daviña   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
757c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
758c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
759c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
760c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
761aa372e3fSPaul Mullowney 
762aa372e3fSPaul Mullowney   /* Create the solve analysis information */
763c41cb2e2SAlejandro Lamas Daviña   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
764aa372e3fSPaul Mullowney 
765aa372e3fSPaul Mullowney   /* set the operation */
766aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
767aa372e3fSPaul Mullowney 
768aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
769aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
770aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
771aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
772aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
773aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
774aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
775aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
776aa372e3fSPaul Mullowney 
777aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
778aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
779aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
780aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
781aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
782aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
783aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
784aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
785aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
786c41cb2e2SAlejandro Lamas Daviña                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
787aa372e3fSPaul Mullowney 
788aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
789aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
790aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
791aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
792aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
793c41cb2e2SAlejandro Lamas Daviña                            loTriFactorT->solveInfo);CHKERRCUDA(stat);
794aa372e3fSPaul Mullowney 
795aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
796aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
797aa372e3fSPaul Mullowney 
798aa372e3fSPaul Mullowney   /*********************************************/
799aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
800aa372e3fSPaul Mullowney   /*********************************************/
801aa372e3fSPaul Mullowney 
802aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
803aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
804aa372e3fSPaul Mullowney 
805aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
806aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
807aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
808aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
809aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
810aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
811aa372e3fSPaul Mullowney 
812aa372e3fSPaul Mullowney   /* Create the matrix description */
813c41cb2e2SAlejandro Lamas Daviña   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
814c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
815c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
816c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
817c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney   /* Create the solve analysis information */
820c41cb2e2SAlejandro Lamas Daviña   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
821aa372e3fSPaul Mullowney 
822aa372e3fSPaul Mullowney   /* set the operation */
823aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
824aa372e3fSPaul Mullowney 
825aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
826aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
827aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
828aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
829aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
830aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
831aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
832aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
833aa372e3fSPaul Mullowney 
834aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
835aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
836aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
837aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
838aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
839aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
840aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
841aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
842aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
843c41cb2e2SAlejandro Lamas Daviña                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
846aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
847aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
848aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
849aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
850c41cb2e2SAlejandro Lamas Daviña                            upTriFactorT->solveInfo);CHKERRCUDA(stat);
851aa372e3fSPaul Mullowney 
852aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
853aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
854bda325fcSPaul Mullowney   PetscFunctionReturn(0);
855bda325fcSPaul Mullowney }
856bda325fcSPaul Mullowney 
857b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
858bda325fcSPaul Mullowney {
859aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
860aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
861aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
862bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
863bda325fcSPaul Mullowney   cusparseStatus_t             stat;
864aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
865b06137fdSPaul Mullowney   cudaError_t                  err;
8664863603aSSatish Balay   PetscErrorCode               ierr;
867b175d8bbSPaul Mullowney 
868bda325fcSPaul Mullowney   PetscFunctionBegin;
869aa372e3fSPaul Mullowney 
870aa372e3fSPaul Mullowney   /* allocate space for the triangular factor information */
871aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
872c41cb2e2SAlejandro Lamas Daviña   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
873aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
874c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
875c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
876aa372e3fSPaul Mullowney 
877b06137fdSPaul Mullowney   /* set alpha and beta */
878c41cb2e2SAlejandro Lamas Daviña   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
8797656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
8807656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
8817656d835SStefano Zampini   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
8827656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
8837656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
884c41cb2e2SAlejandro Lamas Daviña   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
885b06137fdSPaul Mullowney 
886aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
887aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
888aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
889a3fdcf43SKarl Rupp     thrust::device_vector<int> *rowoffsets_gpu;
890a3fdcf43SKarl Rupp     const int ncomprow = matstruct->cprowIndices->size();
891a3fdcf43SKarl Rupp     thrust::host_vector<int> cprow(ncomprow);
892a3fdcf43SKarl Rupp     cprow = *matstruct->cprowIndices; // GPU --> CPU
893554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
894554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
895aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
896a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
897aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
898aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
899a3fdcf43SKarl Rupp     PetscInt k,i;
900a3fdcf43SKarl Rupp     thrust::host_vector<int> rowst_host(A->rmap->n+1);
901a3fdcf43SKarl Rupp 
902a3fdcf43SKarl Rupp     /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */
903a3fdcf43SKarl Rupp     rowst_host[0] = 0;
904a3fdcf43SKarl Rupp     for (k = 0, i = 0; i < A->rmap->n ; i++) {
905a3fdcf43SKarl Rupp       if (k < ncomprow && i==cprow[k]) {
906a3fdcf43SKarl Rupp 	rowst_host[i+1] = a->i[i+1];
907a3fdcf43SKarl Rupp 	k++;
908a3fdcf43SKarl Rupp       } else rowst_host[i+1] = rowst_host[i];
909a3fdcf43SKarl Rupp     }
910a3fdcf43SKarl Rupp     if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
911a3fdcf43SKarl Rupp     rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
912a3fdcf43SKarl Rupp     *rowoffsets_gpu = rowst_host; // CPU --> GPU
913aa372e3fSPaul Mullowney 
914aa372e3fSPaul Mullowney     /* compute the transpose of the upper triangular factor, i.e. the CSC */
915aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
916a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
917a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
918aa372e3fSPaul Mullowney                             matrix->values->data().get(),
919a3fdcf43SKarl Rupp                             rowoffsets_gpu->data().get(),
920aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
921aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
922aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
923aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
924c41cb2e2SAlejandro Lamas Daviña                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
925aa372e3fSPaul Mullowney 
926aa372e3fSPaul Mullowney     /* assign the pointer */
927aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
9284863603aSSatish Balay     ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
929aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
930aa372e3fSPaul Mullowney     /* First convert HYB to CSR */
931aa372e3fSPaul Mullowney     CsrMatrix *temp= new CsrMatrix;
932aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
933aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
934aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
935aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
936aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
937aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
938aa372e3fSPaul Mullowney 
9392692e278SPaul Mullowney 
940aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
941aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
942aa372e3fSPaul Mullowney                             temp->values->data().get(),
943aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
944c41cb2e2SAlejandro Lamas Daviña                             temp->column_indices->data().get());CHKERRCUDA(stat);
945aa372e3fSPaul Mullowney 
946aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
947aa372e3fSPaul Mullowney     CsrMatrix *tempT= new CsrMatrix;
948aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
949aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
950aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
951aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
952aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
953aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
954aa372e3fSPaul Mullowney 
955aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
956aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
957aa372e3fSPaul Mullowney                             temp->values->data().get(),
958aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
959aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
960aa372e3fSPaul Mullowney                             tempT->values->data().get(),
961aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
962aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
963c41cb2e2SAlejandro Lamas Daviña                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
964aa372e3fSPaul Mullowney 
965aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
966aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
967c41cb2e2SAlejandro Lamas Daviña     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
968aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
969aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
970aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
971aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
972aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
973aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
974c41cb2e2SAlejandro Lamas Daviña                             hybMat, 0, partition);CHKERRCUDA(stat);
975aa372e3fSPaul Mullowney 
976aa372e3fSPaul Mullowney     /* assign the pointer */
977aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
9784863603aSSatish Balay     ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);
979aa372e3fSPaul Mullowney 
980aa372e3fSPaul Mullowney     /* delete temporaries */
981aa372e3fSPaul Mullowney     if (tempT) {
982aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
983aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
984aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
985aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
986087f3262SPaul Mullowney     }
987aa372e3fSPaul Mullowney     if (temp) {
988aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
989aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
990aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
991aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
992aa372e3fSPaul Mullowney     }
993aa372e3fSPaul Mullowney   }
994aa372e3fSPaul Mullowney   /* assign the compressed row indices */
995aa372e3fSPaul Mullowney   matstructT->cprowIndices = new THRUSTINTARRAY;
996554b8892SKarl Rupp   matstructT->cprowIndices->resize(A->cmap->n);
997554b8892SKarl Rupp   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
998aa372e3fSPaul Mullowney   /* assign the pointer */
999aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1000bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1001bda325fcSPaul Mullowney }
1002bda325fcSPaul Mullowney 
1003*4e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
10046fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1005bda325fcSPaul Mullowney {
1006c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1007465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1008465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1009465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1010465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1011bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1012bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1013aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1014aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1015aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1016b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
1017bda325fcSPaul Mullowney 
1018bda325fcSPaul Mullowney   PetscFunctionBegin;
1019aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1020aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1021bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1022aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1023aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1024bda325fcSPaul Mullowney   }
1025bda325fcSPaul Mullowney 
1026bda325fcSPaul Mullowney   /* Get the GPU pointers */
1027c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1028c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1029c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1030c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1031bda325fcSPaul Mullowney 
10327a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1033aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1034c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1035c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1036c41cb2e2SAlejandro Lamas Daviña                xGPU);
1037aa372e3fSPaul Mullowney 
1038aa372e3fSPaul Mullowney   /* First, solve U */
1039aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
10407656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1041aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1042aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1043aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1044aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1045c41cb2e2SAlejandro Lamas Daviña                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1046aa372e3fSPaul Mullowney 
1047aa372e3fSPaul Mullowney   /* Then, solve L */
1048aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
10497656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1050aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1051aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1052aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1053aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1054c41cb2e2SAlejandro Lamas Daviña                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1055aa372e3fSPaul Mullowney 
1056aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1057c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1058c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1059aa372e3fSPaul Mullowney                tempGPU->begin());
1060aa372e3fSPaul Mullowney 
1061aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1062c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1063bda325fcSPaul Mullowney 
1064bda325fcSPaul Mullowney   /* restore */
1065c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1066c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1067c41cb2e2SAlejandro Lamas Daviña   ierr = WaitForGPU();CHKERRCUDA(ierr);
1068661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1069958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1070bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1071bda325fcSPaul Mullowney }
1072bda325fcSPaul Mullowney 
10736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1074bda325fcSPaul Mullowney {
1075465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1076465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1077bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1078bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1079aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1080aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1081aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1082b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
1083bda325fcSPaul Mullowney 
1084bda325fcSPaul Mullowney   PetscFunctionBegin;
1085aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1086aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1087bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1088aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1089aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1090bda325fcSPaul Mullowney   }
1091bda325fcSPaul Mullowney 
1092bda325fcSPaul Mullowney   /* Get the GPU pointers */
1093c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1094c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1095bda325fcSPaul Mullowney 
10967a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1097aa372e3fSPaul Mullowney   /* First, solve U */
1098aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
10997656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1100aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1101aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1102aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1103aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1104c41cb2e2SAlejandro Lamas Daviña                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1105aa372e3fSPaul Mullowney 
1106aa372e3fSPaul Mullowney   /* Then, solve L */
1107aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
11087656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1109aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1110aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1111aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1112aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1113c41cb2e2SAlejandro Lamas Daviña                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1114bda325fcSPaul Mullowney 
1115bda325fcSPaul Mullowney   /* restore */
1116c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1117c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1118c41cb2e2SAlejandro Lamas Daviña   ierr = WaitForGPU();CHKERRCUDA(ierr);
1119661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1120958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1121bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1122bda325fcSPaul Mullowney }
1123bda325fcSPaul Mullowney 
11246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
11259ae82921SPaul Mullowney {
1126465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1127465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1128465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1129465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
11309ae82921SPaul Mullowney   cusparseStatus_t                      stat;
11319ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1132aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1133aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1134aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1135b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
11369ae82921SPaul Mullowney 
11379ae82921SPaul Mullowney   PetscFunctionBegin;
1138ebc8f436SDominic Meiser 
1139e057df02SPaul Mullowney   /* Get the GPU pointers */
1140c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1141c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1142c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1143c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
11449ae82921SPaul Mullowney 
11457a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1146aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1147c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1148c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1149*4e4bbfaaSStefano Zampini                tempGPU->begin());
1150aa372e3fSPaul Mullowney 
1151aa372e3fSPaul Mullowney   /* Next, solve L */
1152aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
11537656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1154aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1155aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1156aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1157aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1158*4e4bbfaaSStefano Zampini                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1159aa372e3fSPaul Mullowney 
1160aa372e3fSPaul Mullowney   /* Then, solve U */
1161aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
11627656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1163aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1164aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1165aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1166aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1167*4e4bbfaaSStefano Zampini                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1168aa372e3fSPaul Mullowney 
1169*4e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1170*4e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1171*4e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1172*4e4bbfaaSStefano Zampini                xGPU);
11739ae82921SPaul Mullowney 
1174c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1175c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1176c41cb2e2SAlejandro Lamas Daviña   ierr = WaitForGPU();CHKERRCUDA(ierr);
1177661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1178958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
11799ae82921SPaul Mullowney   PetscFunctionReturn(0);
11809ae82921SPaul Mullowney }
11819ae82921SPaul Mullowney 
11826fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
11839ae82921SPaul Mullowney {
1184465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1185465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
11869ae82921SPaul Mullowney   cusparseStatus_t                  stat;
11879ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1188aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1189aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1190aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1191b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
11929ae82921SPaul Mullowney 
11939ae82921SPaul Mullowney   PetscFunctionBegin;
1194e057df02SPaul Mullowney   /* Get the GPU pointers */
1195c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1196c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
11979ae82921SPaul Mullowney 
11987a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1199aa372e3fSPaul Mullowney   /* First, solve L */
1200aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
12017656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1202aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1203aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1204aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1205aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1206c41cb2e2SAlejandro Lamas Daviña                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1207aa372e3fSPaul Mullowney 
1208aa372e3fSPaul Mullowney   /* Next, solve U */
1209aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
12107656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1211aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1212aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1213aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1214aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1215c41cb2e2SAlejandro Lamas Daviña                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
12169ae82921SPaul Mullowney 
1217c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1218c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1219c41cb2e2SAlejandro Lamas Daviña   ierr = WaitForGPU();CHKERRCUDA(ierr);
1220661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1221958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12229ae82921SPaul Mullowney   PetscFunctionReturn(0);
12239ae82921SPaul Mullowney }
12249ae82921SPaul Mullowney 
12256fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
12269ae82921SPaul Mullowney {
1227aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1228aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
12299ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
12309ae82921SPaul Mullowney   PetscInt                     m = A->rmap->n,*ii,*ridx;
12319ae82921SPaul Mullowney   PetscErrorCode               ierr;
1232aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1233b06137fdSPaul Mullowney   cudaError_t                  err;
12349ae82921SPaul Mullowney 
12359ae82921SPaul Mullowney   PetscFunctionBegin;
1236b8ced49eSKarl Rupp   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
12379ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
123834d6c7a5SJose E. Roman     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
123934d6c7a5SJose E. Roman       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
124034d6c7a5SJose E. Roman       /* copy values only */
124134d6c7a5SJose E. Roman       matrix->values->assign(a->a, a->a+a->nz);
12424863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
124334d6c7a5SJose E. Roman     } else {
1244470880abSPatrick Sanan       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
12459ae82921SPaul Mullowney       try {
1246aa372e3fSPaul Mullowney         cusparsestruct->nonzerorow=0;
1247aa372e3fSPaul Mullowney         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
12489ae82921SPaul Mullowney 
12499ae82921SPaul Mullowney         if (a->compressedrow.use) {
12509ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
12519ae82921SPaul Mullowney           ii   = a->compressedrow.i;
12529ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
12539ae82921SPaul Mullowney         } else {
1254b06137fdSPaul Mullowney           /* Forcing compressed row on the GPU */
12559ae82921SPaul Mullowney           int k=0;
1256854ce69bSBarry Smith           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1257854ce69bSBarry Smith           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
12589ae82921SPaul Mullowney           ii[0]=0;
12599ae82921SPaul Mullowney           for (int j = 0; j<m; j++) {
12609ae82921SPaul Mullowney             if ((a->i[j+1]-a->i[j])>0) {
12619ae82921SPaul Mullowney               ii[k]  = a->i[j];
12629ae82921SPaul Mullowney               ridx[k]= j;
12639ae82921SPaul Mullowney               k++;
12649ae82921SPaul Mullowney             }
12659ae82921SPaul Mullowney           }
1266aa372e3fSPaul Mullowney           ii[cusparsestruct->nonzerorow] = a->nz;
1267aa372e3fSPaul Mullowney           m = cusparsestruct->nonzerorow;
12689ae82921SPaul Mullowney         }
12699ae82921SPaul Mullowney 
1270aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
1271aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1272c41cb2e2SAlejandro Lamas Daviña         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1273c41cb2e2SAlejandro Lamas Daviña         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1274c41cb2e2SAlejandro Lamas Daviña         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
12759ae82921SPaul Mullowney 
1276c41cb2e2SAlejandro Lamas Daviña         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
12777656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12787656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
12797656d835SStefano Zampini         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12807656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12817656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1282c41cb2e2SAlejandro Lamas Daviña         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1283b06137fdSPaul Mullowney 
1284aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1285aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1286aa372e3fSPaul Mullowney           /* set the matrix */
1287aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1288a65300a6SPaul Mullowney           matrix->num_rows = m;
1289aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1290aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1291a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1292a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
12939ae82921SPaul Mullowney 
1294aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1295aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1296aa372e3fSPaul Mullowney 
1297aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1298aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1299aa372e3fSPaul Mullowney 
1300aa372e3fSPaul Mullowney           /* assign the pointer */
1301aa372e3fSPaul Mullowney           matstruct->mat = matrix;
1302aa372e3fSPaul Mullowney 
1303aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1304aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1305a65300a6SPaul Mullowney           matrix->num_rows = m;
1306aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1307aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1308a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1309a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
1310aa372e3fSPaul Mullowney 
1311aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1312aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1313aa372e3fSPaul Mullowney 
1314aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1315aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1316aa372e3fSPaul Mullowney 
1317aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
1318c41cb2e2SAlejandro Lamas Daviña           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1319aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1320aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1321a65300a6SPaul Mullowney           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1322aa372e3fSPaul Mullowney               matstruct->descr, matrix->values->data().get(),
1323aa372e3fSPaul Mullowney               matrix->row_offsets->data().get(),
1324aa372e3fSPaul Mullowney               matrix->column_indices->data().get(),
1325c41cb2e2SAlejandro Lamas Daviña               hybMat, 0, partition);CHKERRCUDA(stat);
1326aa372e3fSPaul Mullowney           /* assign the pointer */
1327aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1328aa372e3fSPaul Mullowney 
1329aa372e3fSPaul Mullowney           if (matrix) {
1330aa372e3fSPaul Mullowney             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1331aa372e3fSPaul Mullowney             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1332aa372e3fSPaul Mullowney             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1333aa372e3fSPaul Mullowney             delete (CsrMatrix*)matrix;
1334087f3262SPaul Mullowney           }
1335087f3262SPaul Mullowney         }
1336ca45077fSPaul Mullowney 
1337aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1338aa372e3fSPaul Mullowney         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1339aa372e3fSPaul Mullowney         matstruct->cprowIndices->assign(ridx,ridx+m);
13404863603aSSatish Balay         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1341aa372e3fSPaul Mullowney 
1342aa372e3fSPaul Mullowney         /* assign the pointer */
1343aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
1344aa372e3fSPaul Mullowney 
13459ae82921SPaul Mullowney         if (!a->compressedrow.use) {
13469ae82921SPaul Mullowney           ierr = PetscFree(ii);CHKERRQ(ierr);
13479ae82921SPaul Mullowney           ierr = PetscFree(ridx);CHKERRQ(ierr);
13489ae82921SPaul Mullowney         }
1349e65717acSKarl Rupp         cusparsestruct->workVector = new THRUSTARRAY(m);
13509ae82921SPaul Mullowney       } catch(char *ex) {
13519ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
13529ae82921SPaul Mullowney       }
135334d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
135434d6c7a5SJose E. Roman     }
13557d0e52d8SJose E. Roman     ierr = WaitForGPU();CHKERRCUDA(ierr);
1356b8ced49eSKarl Rupp     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
13579ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
13589ae82921SPaul Mullowney   }
13599ae82921SPaul Mullowney   PetscFunctionReturn(0);
13609ae82921SPaul Mullowney }
13619ae82921SPaul Mullowney 
1362c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1363aa372e3fSPaul Mullowney {
1364aa372e3fSPaul Mullowney   template <typename Tuple>
1365aa372e3fSPaul Mullowney   __host__ __device__
1366aa372e3fSPaul Mullowney   void operator()(Tuple t)
1367aa372e3fSPaul Mullowney   {
1368aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1369aa372e3fSPaul Mullowney   }
1370aa372e3fSPaul Mullowney };
1371aa372e3fSPaul Mullowney 
13726fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
13739ae82921SPaul Mullowney {
1374b175d8bbSPaul Mullowney   PetscErrorCode ierr;
13759ae82921SPaul Mullowney 
13769ae82921SPaul Mullowney   PetscFunctionBegin;
13777656d835SStefano Zampini   ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
13789ae82921SPaul Mullowney   PetscFunctionReturn(0);
13799ae82921SPaul Mullowney }
13809ae82921SPaul Mullowney 
13816fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1382ca45077fSPaul Mullowney {
1383ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1384aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
13859ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1386465f34aeSAlejandro Lamas Daviña   const PetscScalar            *xarray;
1387465f34aeSAlejandro Lamas Daviña   PetscScalar                  *yarray;
1388b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1389aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1390ca45077fSPaul Mullowney 
1391ca45077fSPaul Mullowney   PetscFunctionBegin;
139234d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
139334d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
13949ff858a8SKarl Rupp   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1395aa372e3fSPaul Mullowney   if (!matstructT) {
1396bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1397aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1398bda325fcSPaul Mullowney   }
1399c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1400c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1401aa372e3fSPaul Mullowney 
14027a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1403aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1404aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1405aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1406aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols,
1407b06137fdSPaul Mullowney                              mat->num_entries, matstructT->alpha, matstructT->descr,
1408aa372e3fSPaul Mullowney                              mat->values->data().get(), mat->row_offsets->data().get(),
14097656d835SStefano Zampini                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1410c41cb2e2SAlejandro Lamas Daviña                              yarray);CHKERRCUDA(stat);
1411aa372e3fSPaul Mullowney   } else {
1412aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1413aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1414b06137fdSPaul Mullowney                              matstructT->alpha, matstructT->descr, hybMat,
14157656d835SStefano Zampini                              xarray, matstructT->beta_zero,
1416c41cb2e2SAlejandro Lamas Daviña                              yarray);CHKERRCUDA(stat);
1417ca45077fSPaul Mullowney   }
1418c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1419c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1420c41cb2e2SAlejandro Lamas Daviña   ierr = WaitForGPU();CHKERRCUDA(ierr);
1421661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1422958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1423ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1424ca45077fSPaul Mullowney }
1425ca45077fSPaul Mullowney 
1426aa372e3fSPaul Mullowney 
14276fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
14289ae82921SPaul Mullowney {
14299ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1430aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
14319ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1432465f34aeSAlejandro Lamas Daviña   const PetscScalar            *xarray;
14337656d835SStefano Zampini   PetscScalar                  *zarray,*dptr,*beta;
1434b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1435aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1436c1fb3f03SStefano Zampini   PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */
14376e111a19SKarl Rupp 
14389ae82921SPaul Mullowney   PetscFunctionBegin;
143934d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
144034d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
14419ff858a8SKarl Rupp   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
14429ae82921SPaul Mullowney   try {
1443c1fb3f03SStefano Zampini     cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1444c41cb2e2SAlejandro Lamas Daviña     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1445c1fb3f03SStefano Zampini     if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1446f2d70e9dSBarry Smith       ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1447c1fb3f03SStefano Zampini     } else {
1448c1fb3f03SStefano Zampini       ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1449c1fb3f03SStefano Zampini     }
1450c1fb3f03SStefano Zampini     dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
14517656d835SStefano Zampini     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
14529ae82921SPaul Mullowney 
14537a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
14547656d835SStefano Zampini     /* csr_spmv is multiply add */
1455aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1456b06137fdSPaul Mullowney       /* here we need to be careful to set the number of rows in the multiply to the
1457b06137fdSPaul Mullowney          number of compressed rows in the matrix ... which is equivalent to the
1458b06137fdSPaul Mullowney          size of the workVector */
14597656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1460aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1461a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1462b06137fdSPaul Mullowney                                mat->num_entries, matstruct->alpha, matstruct->descr,
1463aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
14647656d835SStefano Zampini                                mat->column_indices->data().get(), xarray, beta,
14657656d835SStefano Zampini                                dptr);CHKERRCUDA(stat);
1466aa372e3fSPaul Mullowney     } else {
1467a65300a6SPaul Mullowney       if (cusparsestruct->workVector->size()) {
1468301298b4SMark Adams 	cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1469aa372e3fSPaul Mullowney         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1470b06137fdSPaul Mullowney                                  matstruct->alpha, matstruct->descr, hybMat,
14717656d835SStefano Zampini                                  xarray, beta,
14727656d835SStefano Zampini                                  dptr);CHKERRCUDA(stat);
1473a65300a6SPaul Mullowney       }
1474aa372e3fSPaul Mullowney     }
1475958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1476aa372e3fSPaul Mullowney 
14777656d835SStefano Zampini     if (yy) {
14787656d835SStefano Zampini       if (dptr != zarray) {
14797656d835SStefano Zampini         ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
14807656d835SStefano Zampini       } else if (zz != yy) {
14817656d835SStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
14827656d835SStefano Zampini       }
14837656d835SStefano Zampini     } else if (dptr != zarray) {
1484c1fb3f03SStefano Zampini       ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
14857656d835SStefano Zampini     }
1486aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
14877a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
14887656d835SStefano Zampini     if (dptr != zarray) {
14897656d835SStefano Zampini       thrust::device_ptr<PetscScalar> zptr;
14907656d835SStefano Zampini 
14917656d835SStefano Zampini       zptr = thrust::device_pointer_cast(zarray);
1492c41cb2e2SAlejandro Lamas Daviña       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1493c41cb2e2SAlejandro Lamas Daviña                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1494c41cb2e2SAlejandro Lamas Daviña                        VecCUDAPlusEquals());
14957656d835SStefano Zampini     }
1496958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1497c41cb2e2SAlejandro Lamas Daviña     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1498c1fb3f03SStefano Zampini     if (yy && !cmpr) {
1499f2d70e9dSBarry Smith       ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1500c1fb3f03SStefano Zampini     } else {
1501c1fb3f03SStefano Zampini       ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1502c1fb3f03SStefano Zampini     }
15039ae82921SPaul Mullowney   } catch(char *ex) {
15049ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
15059ae82921SPaul Mullowney   }
15067656d835SStefano Zampini   if (!yy) { /* MatMult */
15077656d835SStefano Zampini     if (!cusparsestruct->stream) {
1508c41cb2e2SAlejandro Lamas Daviña       ierr = WaitForGPU();CHKERRCUDA(ierr);
15097656d835SStefano Zampini     }
15107656d835SStefano Zampini   }
1511958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
15129ae82921SPaul Mullowney   PetscFunctionReturn(0);
15139ae82921SPaul Mullowney }
15149ae82921SPaul Mullowney 
15156fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1516ca45077fSPaul Mullowney {
1517ca45077fSPaul Mullowney   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1518aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
15199ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1520465f34aeSAlejandro Lamas Daviña   const PetscScalar               *xarray;
1521a3fdcf43SKarl Rupp   PetscScalar                     *zarray,*dptr,*beta;
1522b175d8bbSPaul Mullowney   PetscErrorCode                  ierr;
1523aa372e3fSPaul Mullowney   cusparseStatus_t                stat;
15246e111a19SKarl Rupp 
1525ca45077fSPaul Mullowney   PetscFunctionBegin;
152634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
152734d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
15289ff858a8SKarl Rupp   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1529aa372e3fSPaul Mullowney   if (!matstructT) {
1530bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1531aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1532bda325fcSPaul Mullowney   }
1533aa372e3fSPaul Mullowney 
1534ca45077fSPaul Mullowney   try {
1535c41cb2e2SAlejandro Lamas Daviña     ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1536c41cb2e2SAlejandro Lamas Daviña     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1537f2d70e9dSBarry Smith     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1538a3fdcf43SKarl Rupp     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
1539a3fdcf43SKarl Rupp     beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;
1540ca45077fSPaul Mullowney 
15417a052e47Shannah_mairs     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1542e057df02SPaul Mullowney     /* multiply add with matrix transpose */
1543aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1544aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1545b06137fdSPaul Mullowney       /* here we need to be careful to set the number of rows in the multiply to the
1546b06137fdSPaul Mullowney          number of compressed rows in the matrix ... which is equivalent to the
1547b06137fdSPaul Mullowney          size of the workVector */
1548aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1549a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1550b06137fdSPaul Mullowney                                mat->num_entries, matstructT->alpha, matstructT->descr,
1551aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1552a3fdcf43SKarl Rupp                                mat->column_indices->data().get(), xarray, beta,
1553a3fdcf43SKarl Rupp                                dptr);CHKERRCUDA(stat);
1554aa372e3fSPaul Mullowney     } else {
1555aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1556a65300a6SPaul Mullowney       if (cusparsestruct->workVector->size()) {
1557aa372e3fSPaul Mullowney         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1558b06137fdSPaul Mullowney                                  matstructT->alpha, matstructT->descr, hybMat,
1559a3fdcf43SKarl Rupp                                  xarray, beta,
1560a3fdcf43SKarl Rupp                                  dptr);CHKERRCUDA(stat);
1561a65300a6SPaul Mullowney       }
1562aa372e3fSPaul Mullowney     }
1563a3fdcf43SKarl Rupp     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1564a3fdcf43SKarl Rupp 
1565a3fdcf43SKarl Rupp     if (dptr != zarray) {
1566a3fdcf43SKarl Rupp       ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1567a3fdcf43SKarl Rupp     } else if (zz != yy) {
1568a3fdcf43SKarl Rupp       ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1569a3fdcf43SKarl Rupp     }
1570a3fdcf43SKarl Rupp     /* scatter the data from the temporary into the full vector with a += operation */
1571a3fdcf43SKarl Rupp     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1572a3fdcf43SKarl Rupp     if (dptr != zarray) {
1573a3fdcf43SKarl Rupp       thrust::device_ptr<PetscScalar> zptr;
1574a3fdcf43SKarl Rupp 
1575a3fdcf43SKarl Rupp       zptr = thrust::device_pointer_cast(zarray);
1576aa372e3fSPaul Mullowney 
1577aa372e3fSPaul Mullowney       /* scatter the data from the temporary into the full vector with a += operation */
1578c41cb2e2SAlejandro Lamas Daviña       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1579554b8892SKarl Rupp                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1580c41cb2e2SAlejandro Lamas Daviña                        VecCUDAPlusEquals());
1581a3fdcf43SKarl Rupp     }
1582a3fdcf43SKarl Rupp     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1583c41cb2e2SAlejandro Lamas Daviña     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1584f2d70e9dSBarry Smith     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1585ca45077fSPaul Mullowney   } catch(char *ex) {
1586ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1587661c2d29Shannah_mairs   }
1588a3fdcf43SKarl Rupp   ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
1589958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1590ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1591ca45077fSPaul Mullowney }
1592ca45077fSPaul Mullowney 
15936fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
15949ae82921SPaul Mullowney {
15959ae82921SPaul Mullowney   PetscErrorCode ierr;
15966e111a19SKarl Rupp 
15979ae82921SPaul Mullowney   PetscFunctionBegin;
15989ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1599489de41dSStefano Zampini   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1600bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1601e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1602bc3f50f2SPaul Mullowney   }
1603bbf3fe20SPaul Mullowney   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1604bbf3fe20SPaul Mullowney   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1605bbf3fe20SPaul Mullowney   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1606bbf3fe20SPaul Mullowney   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
16079ae82921SPaul Mullowney   PetscFunctionReturn(0);
16089ae82921SPaul Mullowney }
16099ae82921SPaul Mullowney 
16109ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
1611e057df02SPaul Mullowney /*@
16129ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1613e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1614e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1615e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1616e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1617e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
16189ae82921SPaul Mullowney 
1619d083f849SBarry Smith    Collective
16209ae82921SPaul Mullowney 
16219ae82921SPaul Mullowney    Input Parameters:
16229ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
16239ae82921SPaul Mullowney .  m - number of rows
16249ae82921SPaul Mullowney .  n - number of columns
16259ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
16269ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
16270298fd71SBarry Smith          (possibly different for each row) or NULL
16289ae82921SPaul Mullowney 
16299ae82921SPaul Mullowney    Output Parameter:
16309ae82921SPaul Mullowney .  A - the matrix
16319ae82921SPaul Mullowney 
16329ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
16339ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
16349ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
16359ae82921SPaul Mullowney 
16369ae82921SPaul Mullowney    Notes:
16379ae82921SPaul Mullowney    If nnz is given then nz is ignored
16389ae82921SPaul Mullowney 
16399ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
16409ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
16419ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
16429ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
16439ae82921SPaul Mullowney 
16449ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
16450298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
16469ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
16479ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
16489ae82921SPaul Mullowney 
16499ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
16509ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
16519ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
16529ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
16539ae82921SPaul Mullowney 
16549ae82921SPaul Mullowney    Level: intermediate
16559ae82921SPaul Mullowney 
1656e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
16579ae82921SPaul Mullowney @*/
16589ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
16599ae82921SPaul Mullowney {
16609ae82921SPaul Mullowney   PetscErrorCode ierr;
16619ae82921SPaul Mullowney 
16629ae82921SPaul Mullowney   PetscFunctionBegin;
16639ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
16649ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
16659ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
16669ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
16679ae82921SPaul Mullowney   PetscFunctionReturn(0);
16689ae82921SPaul Mullowney }
16699ae82921SPaul Mullowney 
16706fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
16719ae82921SPaul Mullowney {
16729ae82921SPaul Mullowney   PetscErrorCode   ierr;
1673ab25e6cbSDominic Meiser 
16749ae82921SPaul Mullowney   PetscFunctionBegin;
16759ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
1676b8ced49eSKarl Rupp     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1677470880abSPatrick Sanan       ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
16789ae82921SPaul Mullowney     }
16799ae82921SPaul Mullowney   } else {
1680470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1681aa372e3fSPaul Mullowney   }
16829ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
16839ae82921SPaul Mullowney   PetscFunctionReturn(0);
16849ae82921SPaul Mullowney }
16859ae82921SPaul Mullowney 
16869ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
16879ff858a8SKarl Rupp {
16889ff858a8SKarl Rupp   PetscErrorCode ierr;
16899ff858a8SKarl Rupp   Mat C;
16909ff858a8SKarl Rupp   cusparseStatus_t stat;
16919ff858a8SKarl Rupp   cusparseHandle_t handle=0;
16929ff858a8SKarl Rupp 
16939ff858a8SKarl Rupp   PetscFunctionBegin;
16949ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
16959ff858a8SKarl Rupp   C    = *B;
169634136279SStefano Zampini   ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
169734136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);
169834136279SStefano Zampini 
16999ff858a8SKarl Rupp   /* inject CUSPARSE-specific stuff */
17009ff858a8SKarl Rupp   if (C->factortype==MAT_FACTOR_NONE) {
17019ff858a8SKarl Rupp     /* you cannot check the inode.use flag here since the matrix was just created.
17029ff858a8SKarl Rupp        now build a GPU matrix data structure */
17039ff858a8SKarl Rupp     C->spptr = new Mat_SeqAIJCUSPARSE;
17049ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
17059ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
17069ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
17079ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
17089ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
17099ff858a8SKarl Rupp     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
17109ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
17119ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
17129ff858a8SKarl Rupp   } else {
17139ff858a8SKarl Rupp     /* NEXT, set the pointers to the triangular factors */
17149ff858a8SKarl Rupp     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
17159ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
17169ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
17179ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
17189ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
17199ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
17209ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
17219ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
17229ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
17239ff858a8SKarl Rupp     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
17249ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
17259ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
17269ff858a8SKarl Rupp   }
17279ff858a8SKarl Rupp 
17289ff858a8SKarl Rupp   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
17299ff858a8SKarl Rupp   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
17309ff858a8SKarl Rupp   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
17319ff858a8SKarl Rupp   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
17329ff858a8SKarl Rupp   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
17339ff858a8SKarl Rupp   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
17349ff858a8SKarl Rupp   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
17359ff858a8SKarl Rupp   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
17369ff858a8SKarl Rupp 
17379ff858a8SKarl Rupp   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
17389ff858a8SKarl Rupp 
1739b8ced49eSKarl Rupp   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
17409ff858a8SKarl Rupp 
17419ff858a8SKarl Rupp   ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
17429ff858a8SKarl Rupp   PetscFunctionReturn(0);
17439ff858a8SKarl Rupp }
17449ff858a8SKarl Rupp 
174502fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
17469ae82921SPaul Mullowney {
17479ae82921SPaul Mullowney   PetscErrorCode ierr;
1748aa372e3fSPaul Mullowney   cusparseStatus_t stat;
1749aa372e3fSPaul Mullowney   cusparseHandle_t handle=0;
17509ae82921SPaul Mullowney 
17519ae82921SPaul Mullowney   PetscFunctionBegin;
175234136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
175334136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
175434136279SStefano Zampini 
17559ae82921SPaul Mullowney   if (B->factortype==MAT_FACTOR_NONE) {
1756e057df02SPaul Mullowney     /* you cannot check the inode.use flag here since the matrix was just created.
1757e057df02SPaul Mullowney        now build a GPU matrix data structure */
17589ae82921SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSE;
17599ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1760aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1761aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1762e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1763aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1764c41cb2e2SAlejandro Lamas Daviña     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1765aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
17669ff858a8SKarl Rupp     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
17679ae82921SPaul Mullowney   } else {
17689ae82921SPaul Mullowney     /* NEXT, set the pointers to the triangular factors */
1769debe9ee2SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
17709ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
17719ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1772aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1773aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1774aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1775aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1776aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1777c41cb2e2SAlejandro Lamas Daviña     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1778aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1779aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
17809ae82921SPaul Mullowney   }
1781aa372e3fSPaul Mullowney 
17829ae82921SPaul Mullowney   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
17839ae82921SPaul Mullowney   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
17849ae82921SPaul Mullowney   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1785ca45077fSPaul Mullowney   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1786ca45077fSPaul Mullowney   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1787ca45077fSPaul Mullowney   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1788ca45077fSPaul Mullowney   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
17899ff858a8SKarl Rupp   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
17902205254eSKarl Rupp 
17919ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
17922205254eSKarl Rupp 
1793b8ced49eSKarl Rupp   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
17942205254eSKarl Rupp 
1795bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
17969ae82921SPaul Mullowney   PetscFunctionReturn(0);
17979ae82921SPaul Mullowney }
17989ae82921SPaul Mullowney 
179902fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
180002fe1965SBarry Smith {
180102fe1965SBarry Smith   PetscErrorCode ierr;
180202fe1965SBarry Smith 
180302fe1965SBarry Smith   PetscFunctionBegin;
180402fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1805489de41dSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
180602fe1965SBarry Smith   PetscFunctionReturn(0);
180702fe1965SBarry Smith }
180802fe1965SBarry Smith 
18093ca39a21SBarry Smith /*MC
1810e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1811e057df02SPaul Mullowney 
1812e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
18132692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
18142692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1815e057df02SPaul Mullowney 
1816e057df02SPaul Mullowney    Options Database Keys:
1817e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1818aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1819a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1820e057df02SPaul Mullowney 
1821e057df02SPaul Mullowney   Level: beginner
1822e057df02SPaul Mullowney 
18238468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1824e057df02SPaul Mullowney M*/
18257f756511SDominic Meiser 
182642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
182742c9c57cSBarry Smith 
18280f39cd5aSBarry Smith 
18293ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
183042c9c57cSBarry Smith {
183142c9c57cSBarry Smith   PetscErrorCode ierr;
183242c9c57cSBarry Smith 
183342c9c57cSBarry Smith   PetscFunctionBegin;
18343ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
18353ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
18363ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
18373ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
183842c9c57cSBarry Smith   PetscFunctionReturn(0);
183942c9c57cSBarry Smith }
184029b38603SBarry Smith 
184181e08676SBarry Smith 
1842470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
18437f756511SDominic Meiser {
18447f756511SDominic Meiser   cusparseStatus_t stat;
18457f756511SDominic Meiser   cusparseHandle_t handle;
18467f756511SDominic Meiser 
18477f756511SDominic Meiser   PetscFunctionBegin;
18487f756511SDominic Meiser   if (*cusparsestruct) {
1849470880abSPatrick Sanan     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1850470880abSPatrick Sanan     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
18517f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
18527f756511SDominic Meiser     if (handle = (*cusparsestruct)->handle) {
1853c41cb2e2SAlejandro Lamas Daviña       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
18547f756511SDominic Meiser     }
18557f756511SDominic Meiser     delete *cusparsestruct;
18567f756511SDominic Meiser     *cusparsestruct = 0;
18577f756511SDominic Meiser   }
18587f756511SDominic Meiser   PetscFunctionReturn(0);
18597f756511SDominic Meiser }
18607f756511SDominic Meiser 
18617f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
18627f756511SDominic Meiser {
18637f756511SDominic Meiser   PetscFunctionBegin;
18647f756511SDominic Meiser   if (*mat) {
18657f756511SDominic Meiser     delete (*mat)->values;
18667f756511SDominic Meiser     delete (*mat)->column_indices;
18677f756511SDominic Meiser     delete (*mat)->row_offsets;
18687f756511SDominic Meiser     delete *mat;
18697f756511SDominic Meiser     *mat = 0;
18707f756511SDominic Meiser   }
18717f756511SDominic Meiser   PetscFunctionReturn(0);
18727f756511SDominic Meiser }
18737f756511SDominic Meiser 
1874470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
18757f756511SDominic Meiser {
18767f756511SDominic Meiser   cusparseStatus_t stat;
18777f756511SDominic Meiser   PetscErrorCode   ierr;
18787f756511SDominic Meiser 
18797f756511SDominic Meiser   PetscFunctionBegin;
18807f756511SDominic Meiser   if (*trifactor) {
1881c41cb2e2SAlejandro Lamas Daviña     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1882c41cb2e2SAlejandro Lamas Daviña     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
18837f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
18847f756511SDominic Meiser     delete *trifactor;
18857f756511SDominic Meiser     *trifactor = 0;
18867f756511SDominic Meiser   }
18877f756511SDominic Meiser   PetscFunctionReturn(0);
18887f756511SDominic Meiser }
18897f756511SDominic Meiser 
1890470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
18917f756511SDominic Meiser {
18927f756511SDominic Meiser   CsrMatrix        *mat;
18937f756511SDominic Meiser   cusparseStatus_t stat;
18947f756511SDominic Meiser   cudaError_t      err;
18957f756511SDominic Meiser 
18967f756511SDominic Meiser   PetscFunctionBegin;
18977f756511SDominic Meiser   if (*matstruct) {
18987f756511SDominic Meiser     if ((*matstruct)->mat) {
18997f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
19007f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1901c41cb2e2SAlejandro Lamas Daviña         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
19027f756511SDominic Meiser       } else {
19037f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
19047f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
19057f756511SDominic Meiser       }
19067f756511SDominic Meiser     }
1907c41cb2e2SAlejandro Lamas Daviña     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
19087f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
1909c41cb2e2SAlejandro Lamas Daviña     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
19107656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
19117656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
19127f756511SDominic Meiser     delete *matstruct;
19137f756511SDominic Meiser     *matstruct = 0;
19147f756511SDominic Meiser   }
19157f756511SDominic Meiser   PetscFunctionReturn(0);
19167f756511SDominic Meiser }
19177f756511SDominic Meiser 
1918470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
19197f756511SDominic Meiser {
19207f756511SDominic Meiser   cusparseHandle_t handle;
19217f756511SDominic Meiser   cusparseStatus_t stat;
19227f756511SDominic Meiser 
19237f756511SDominic Meiser   PetscFunctionBegin;
19247f756511SDominic Meiser   if (*trifactors) {
1925470880abSPatrick Sanan     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1926470880abSPatrick Sanan     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1927470880abSPatrick Sanan     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1928470880abSPatrick Sanan     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
19297f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
19307f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
19317f756511SDominic Meiser     delete (*trifactors)->workVector;
19327f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
1933c41cb2e2SAlejandro Lamas Daviña       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
19347f756511SDominic Meiser     }
19357f756511SDominic Meiser     delete *trifactors;
19367f756511SDominic Meiser     *trifactors = 0;
19377f756511SDominic Meiser   }
19387f756511SDominic Meiser   PetscFunctionReturn(0);
19397f756511SDominic Meiser }
19407f756511SDominic Meiser 
1941