xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 54da937ac53361e7377cf6625207cd5b3a0dbe8c)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
189ae82921SPaul Mullowney 
19087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22087f3262SPaul Mullowney 
236fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
246fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
256fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26087f3262SPaul Mullowney 
276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
306fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
314416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
326fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
336fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
346fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
356fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
399ae82921SPaul Mullowney 
407f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
467f756511SDominic Meiser 
47b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
48b06137fdSPaul Mullowney {
49b06137fdSPaul Mullowney   cusparseStatus_t   stat;
50b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
51b06137fdSPaul Mullowney 
52b06137fdSPaul Mullowney   PetscFunctionBegin;
53b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
5457d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
55b06137fdSPaul Mullowney   PetscFunctionReturn(0);
56b06137fdSPaul Mullowney }
57b06137fdSPaul Mullowney 
58b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
59b06137fdSPaul Mullowney {
60b06137fdSPaul Mullowney   cusparseStatus_t   stat;
61b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
62b06137fdSPaul Mullowney 
63b06137fdSPaul Mullowney   PetscFunctionBegin;
646b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
6516a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
6657d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
6716a2e217SAlejandro Lamas Daviña     }
68b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
696b1cf21dSAlejandro Lamas Daviña   }
7057d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
71b06137fdSPaul Mullowney   PetscFunctionReturn(0);
72b06137fdSPaul Mullowney }
73b06137fdSPaul Mullowney 
74b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
75b06137fdSPaul Mullowney {
76b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
77ccdfe979SStefano Zampini 
78b06137fdSPaul Mullowney   PetscFunctionBegin;
79ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
80b06137fdSPaul Mullowney   PetscFunctionReturn(0);
81b06137fdSPaul Mullowney }
82b06137fdSPaul Mullowney 
83ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
849ae82921SPaul Mullowney {
859ae82921SPaul Mullowney   PetscFunctionBegin;
869ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
879ae82921SPaul Mullowney   PetscFunctionReturn(0);
889ae82921SPaul Mullowney }
899ae82921SPaul Mullowney 
90c708e6cdSJed Brown /*MC
91087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
93087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
94087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
97c708e6cdSJed Brown 
989ae82921SPaul Mullowney   Level: beginner
99c708e6cdSJed Brown 
1003ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101c708e6cdSJed Brown M*/
1029ae82921SPaul Mullowney 
10342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1049ae82921SPaul Mullowney {
1059ae82921SPaul Mullowney   PetscErrorCode ierr;
106bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1079ae82921SPaul Mullowney 
1089ae82921SPaul Mullowney   PetscFunctionBegin;
109bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
110404133a2SPaul Mullowney   (*B)->factortype = ftype;
111bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1129ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1132205254eSKarl Rupp 
114087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
11533d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1169ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1179ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
118087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
119087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
120087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1219ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
122bc3f50f2SPaul Mullowney 
123fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1243ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1259ae82921SPaul Mullowney   PetscFunctionReturn(0);
1269ae82921SPaul Mullowney }
1279ae82921SPaul Mullowney 
128bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
129ca45077fSPaul Mullowney {
130aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1316e111a19SKarl Rupp 
132ca45077fSPaul Mullowney   PetscFunctionBegin;
133ca45077fSPaul Mullowney   switch (op) {
134e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
135aa372e3fSPaul Mullowney     cusparsestruct->format = format;
136ca45077fSPaul Mullowney     break;
137e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
138aa372e3fSPaul Mullowney     cusparsestruct->format = format;
139ca45077fSPaul Mullowney     break;
140ca45077fSPaul Mullowney   default:
14136d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
142ca45077fSPaul Mullowney   }
143ca45077fSPaul Mullowney   PetscFunctionReturn(0);
144ca45077fSPaul Mullowney }
1459ae82921SPaul Mullowney 
146e057df02SPaul Mullowney /*@
147e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
148e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
149aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
150e057df02SPaul Mullowney    Not Collective
151e057df02SPaul Mullowney 
152e057df02SPaul Mullowney    Input Parameters:
1538468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
15436d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1552692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
156e057df02SPaul Mullowney 
157e057df02SPaul Mullowney    Output Parameter:
158e057df02SPaul Mullowney 
159e057df02SPaul Mullowney    Level: intermediate
160e057df02SPaul Mullowney 
1618468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
162e057df02SPaul Mullowney @*/
163e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
164e057df02SPaul Mullowney {
165e057df02SPaul Mullowney   PetscErrorCode ierr;
1666e111a19SKarl Rupp 
167e057df02SPaul Mullowney   PetscFunctionBegin;
168e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
169e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
170e057df02SPaul Mullowney   PetscFunctionReturn(0);
171e057df02SPaul Mullowney }
172e057df02SPaul Mullowney 
173e6e9a74fSStefano Zampini /*@
174e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
175e6e9a74fSStefano Zampini 
176e6e9a74fSStefano Zampini    Collective on mat
177e6e9a74fSStefano Zampini 
178e6e9a74fSStefano Zampini    Input Parameters:
179e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
180e6e9a74fSStefano Zampini -  transgen - the boolean flag
181e6e9a74fSStefano Zampini 
182e6e9a74fSStefano Zampini    Level: intermediate
183e6e9a74fSStefano Zampini 
184e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
185e6e9a74fSStefano Zampini @*/
186e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
187e6e9a74fSStefano Zampini {
188e6e9a74fSStefano Zampini   PetscErrorCode ierr;
189e6e9a74fSStefano Zampini   PetscBool      flg;
190e6e9a74fSStefano Zampini 
191e6e9a74fSStefano Zampini   PetscFunctionBegin;
192e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
193e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
194e6e9a74fSStefano Zampini   if (flg) {
195e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
196*54da937aSStefano Zampini 
197e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
198e6e9a74fSStefano Zampini     cusp->transgen = transgen;
199*54da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
200*54da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
201*54da937aSStefano Zampini     }
202e6e9a74fSStefano Zampini   }
203e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
204e6e9a74fSStefano Zampini }
205e6e9a74fSStefano Zampini 
2064416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2079ae82921SPaul Mullowney {
2089ae82921SPaul Mullowney   PetscErrorCode           ierr;
209e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2109ae82921SPaul Mullowney   PetscBool                flg;
211a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2126e111a19SKarl Rupp 
2139ae82921SPaul Mullowney   PetscFunctionBegin;
214e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2159ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
216*54da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
217*54da937aSStefano Zampini 
218*54da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
219*54da937aSStefano Zampini     if (flg) {
220*54da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);
221*54da937aSStefano Zampini     }
222e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
223a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
224e057df02SPaul Mullowney     if (flg) {
225e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
226045c96e1SPaul Mullowney     }
2279ae82921SPaul Mullowney   }
2284c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
229a183c035SDominic Meiser                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
2304c87dfd4SPaul Mullowney   if (flg) {
2314c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
2324c87dfd4SPaul Mullowney   }
2330af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2349ae82921SPaul Mullowney   PetscFunctionReturn(0);
2359ae82921SPaul Mullowney }
2369ae82921SPaul Mullowney 
2376fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2389ae82921SPaul Mullowney {
2399ae82921SPaul Mullowney   PetscErrorCode ierr;
2409ae82921SPaul Mullowney 
2419ae82921SPaul Mullowney   PetscFunctionBegin;
2429ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2439ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2449ae82921SPaul Mullowney   PetscFunctionReturn(0);
2459ae82921SPaul Mullowney }
2469ae82921SPaul Mullowney 
2476fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2489ae82921SPaul Mullowney {
2499ae82921SPaul Mullowney   PetscErrorCode ierr;
2509ae82921SPaul Mullowney 
2519ae82921SPaul Mullowney   PetscFunctionBegin;
2529ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2539ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2549ae82921SPaul Mullowney   PetscFunctionReturn(0);
2559ae82921SPaul Mullowney }
2569ae82921SPaul Mullowney 
257087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
258087f3262SPaul Mullowney {
259087f3262SPaul Mullowney   PetscErrorCode ierr;
260087f3262SPaul Mullowney 
261087f3262SPaul Mullowney   PetscFunctionBegin;
262087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
263087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
264087f3262SPaul Mullowney   PetscFunctionReturn(0);
265087f3262SPaul Mullowney }
266087f3262SPaul Mullowney 
267087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
268087f3262SPaul Mullowney {
269087f3262SPaul Mullowney   PetscErrorCode ierr;
270087f3262SPaul Mullowney 
271087f3262SPaul Mullowney   PetscFunctionBegin;
272087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
273087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
274087f3262SPaul Mullowney   PetscFunctionReturn(0);
275087f3262SPaul Mullowney }
276087f3262SPaul Mullowney 
277087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2789ae82921SPaul Mullowney {
2799ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2809ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2819ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
282aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2839ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2849ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2859ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2869ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2879ae82921SPaul Mullowney   PetscScalar                       *AALo;
2889ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
289b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
29057d48284SJunchao Zhang   cudaError_t                       cerr;
2919ae82921SPaul Mullowney 
2929ae82921SPaul Mullowney   PetscFunctionBegin;
293cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
294c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2959ae82921SPaul Mullowney     try {
2969ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2979ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2989ae82921SPaul Mullowney 
2999ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
30057d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
30157d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
30257d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3039ae82921SPaul Mullowney 
3049ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
3059ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
3069ae82921SPaul Mullowney       AiLo[n]  = nzLower;
3079ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
3089ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
3099ae82921SPaul Mullowney       v        = aa;
3109ae82921SPaul Mullowney       vi       = aj;
3119ae82921SPaul Mullowney       offset   = 1;
3129ae82921SPaul Mullowney       rowOffset= 1;
3139ae82921SPaul Mullowney       for (i=1; i<n; i++) {
3149ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
315e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
3169ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
3179ae82921SPaul Mullowney         rowOffset += nz+1;
3189ae82921SPaul Mullowney 
319580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
320580bdb30SBarry Smith         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3219ae82921SPaul Mullowney 
3229ae82921SPaul Mullowney         offset      += nz;
3239ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
3249ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
3259ae82921SPaul Mullowney         offset      += 1;
3269ae82921SPaul Mullowney 
3279ae82921SPaul Mullowney         v  += nz;
3289ae82921SPaul Mullowney         vi += nz;
3299ae82921SPaul Mullowney       }
3302205254eSKarl Rupp 
331aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
332aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3332205254eSKarl Rupp 
334aa372e3fSPaul Mullowney       /* Create the matrix description */
33557d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
33657d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
33757d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
33857d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
33957d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
340aa372e3fSPaul Mullowney 
341aa372e3fSPaul Mullowney       /* Create the solve analysis information */
34257d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
343aa372e3fSPaul Mullowney 
344aa372e3fSPaul Mullowney       /* set the operation */
345aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
346aa372e3fSPaul Mullowney 
347aa372e3fSPaul Mullowney       /* set the matrix */
348aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
349aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
350aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
351aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
352aa372e3fSPaul Mullowney 
353aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
354aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
355aa372e3fSPaul Mullowney 
356aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
357aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
358aa372e3fSPaul Mullowney 
359aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
360aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
361aa372e3fSPaul Mullowney 
362aa372e3fSPaul Mullowney       /* perform the solve analysis */
363aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
364aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
365aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
36657d48284SJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
367aa372e3fSPaul Mullowney 
368aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
369aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3702205254eSKarl Rupp 
37157d48284SJunchao Zhang       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
37257d48284SJunchao Zhang       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
37357d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
3744863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
3759ae82921SPaul Mullowney     } catch(char *ex) {
3769ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3779ae82921SPaul Mullowney     }
3789ae82921SPaul Mullowney   }
3799ae82921SPaul Mullowney   PetscFunctionReturn(0);
3809ae82921SPaul Mullowney }
3819ae82921SPaul Mullowney 
382087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3839ae82921SPaul Mullowney {
3849ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3859ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3869ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
387aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3889ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3899ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3909ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3919ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3929ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3939ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3949ae82921SPaul Mullowney   PetscErrorCode                    ierr;
39557d48284SJunchao Zhang   cudaError_t                       cerr;
3969ae82921SPaul Mullowney 
3979ae82921SPaul Mullowney   PetscFunctionBegin;
398cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
399c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4009ae82921SPaul Mullowney     try {
4019ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4029ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
4039ae82921SPaul Mullowney 
4049ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
40557d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
40657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
40757d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
4089ae82921SPaul Mullowney 
4099ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
4109ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
4119ae82921SPaul Mullowney       AiUp[n]=nzUpper;
4129ae82921SPaul Mullowney       offset = nzUpper;
4139ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
4149ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
4159ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
4169ae82921SPaul Mullowney 
417e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
4189ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
4199ae82921SPaul Mullowney 
420e057df02SPaul Mullowney         /* decrement the offset */
4219ae82921SPaul Mullowney         offset -= (nz+1);
4229ae82921SPaul Mullowney 
423e057df02SPaul Mullowney         /* first, set the diagonal elements */
4249ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
42509f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1./v[nz];
4269ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
4279ae82921SPaul Mullowney 
428580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
429580bdb30SBarry Smith         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
4309ae82921SPaul Mullowney       }
4312205254eSKarl Rupp 
432aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
433aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
4342205254eSKarl Rupp 
435aa372e3fSPaul Mullowney       /* Create the matrix description */
43657d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
43757d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
43857d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
43957d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
44057d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
441aa372e3fSPaul Mullowney 
442aa372e3fSPaul Mullowney       /* Create the solve analysis information */
44357d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
444aa372e3fSPaul Mullowney 
445aa372e3fSPaul Mullowney       /* set the operation */
446aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
447aa372e3fSPaul Mullowney 
448aa372e3fSPaul Mullowney       /* set the matrix */
449aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
450aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
451aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
452aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
453aa372e3fSPaul Mullowney 
454aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
455aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
456aa372e3fSPaul Mullowney 
457aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
458aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
459aa372e3fSPaul Mullowney 
460aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
461aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
462aa372e3fSPaul Mullowney 
463aa372e3fSPaul Mullowney       /* perform the solve analysis */
464aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
465aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
466aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
46757d48284SJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
468aa372e3fSPaul Mullowney 
469aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
470aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4712205254eSKarl Rupp 
47257d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
47357d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
47457d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
4754863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
4769ae82921SPaul Mullowney     } catch(char *ex) {
4779ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4789ae82921SPaul Mullowney     }
4799ae82921SPaul Mullowney   }
4809ae82921SPaul Mullowney   PetscFunctionReturn(0);
4819ae82921SPaul Mullowney }
4829ae82921SPaul Mullowney 
483087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4849ae82921SPaul Mullowney {
4859ae82921SPaul Mullowney   PetscErrorCode               ierr;
4869ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4879ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4889ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4899ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4909ae82921SPaul Mullowney   const PetscInt               *r,*c;
4919ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4929ae82921SPaul Mullowney 
4939ae82921SPaul Mullowney   PetscFunctionBegin;
494ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
495087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
496087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4972205254eSKarl Rupp 
498e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
499aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
5009ae82921SPaul Mullowney 
501c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
502e057df02SPaul Mullowney   /* lower triangular indices */
5039ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5049ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5052205254eSKarl Rupp   if (!row_identity) {
506aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
507aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
5082205254eSKarl Rupp   }
5099ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5109ae82921SPaul Mullowney 
511e057df02SPaul Mullowney   /* upper triangular indices */
5129ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
5139ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5142205254eSKarl Rupp   if (!col_identity) {
515aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
516aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
5172205254eSKarl Rupp   }
5184863603aSSatish Balay 
5194863603aSSatish Balay   if (!row_identity && !col_identity) {
5204863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
5214863603aSSatish Balay   } else if(!row_identity) {
5224863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
5234863603aSSatish Balay   } else if(!col_identity) {
5244863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
5254863603aSSatish Balay   }
5264863603aSSatish Balay 
5279ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
5289ae82921SPaul Mullowney   PetscFunctionReturn(0);
5299ae82921SPaul Mullowney }
5309ae82921SPaul Mullowney 
531087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
532087f3262SPaul Mullowney {
533087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
534087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
535aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
536aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
537087f3262SPaul Mullowney   cusparseStatus_t                  stat;
538087f3262SPaul Mullowney   PetscErrorCode                    ierr;
53957d48284SJunchao Zhang   cudaError_t                       cerr;
540087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
541087f3262SPaul Mullowney   PetscScalar                       *AAUp;
542087f3262SPaul Mullowney   PetscScalar                       *AALo;
543087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
544087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
545087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
546087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
547087f3262SPaul Mullowney 
548087f3262SPaul Mullowney   PetscFunctionBegin;
549cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
550c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
551087f3262SPaul Mullowney     try {
552087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
55357d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
55457d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
55557d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
55657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
557087f3262SPaul Mullowney 
558087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
559087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
560087f3262SPaul Mullowney       AiUp[n]=nzUpper;
561087f3262SPaul Mullowney       offset = 0;
562087f3262SPaul Mullowney       for (i=0; i<n; i++) {
563087f3262SPaul Mullowney         /* set the pointers */
564087f3262SPaul Mullowney         v  = aa + ai[i];
565087f3262SPaul Mullowney         vj = aj + ai[i];
566087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
567087f3262SPaul Mullowney 
568087f3262SPaul Mullowney         /* first, set the diagonal elements */
569087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
57009f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1.0/v[nz];
571087f3262SPaul Mullowney         AiUp[i]      = offset;
57209f51544SAlejandro Lamas Daviña         AALo[offset] = (MatScalar)1.0/v[nz];
573087f3262SPaul Mullowney 
574087f3262SPaul Mullowney         offset+=1;
575087f3262SPaul Mullowney         if (nz>0) {
576f22e0265SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
577580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
578087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
579087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
580087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
581087f3262SPaul Mullowney           }
582087f3262SPaul Mullowney           offset+=nz;
583087f3262SPaul Mullowney         }
584087f3262SPaul Mullowney       }
585087f3262SPaul Mullowney 
586aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
587aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
588087f3262SPaul Mullowney 
589aa372e3fSPaul Mullowney       /* Create the matrix description */
59057d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
59157d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
59257d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
59357d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
59457d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
595087f3262SPaul Mullowney 
596aa372e3fSPaul Mullowney       /* Create the solve analysis information */
59757d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
598aa372e3fSPaul Mullowney 
599aa372e3fSPaul Mullowney       /* set the operation */
600aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
601aa372e3fSPaul Mullowney 
602aa372e3fSPaul Mullowney       /* set the matrix */
603aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
604aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
605aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
606aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
607aa372e3fSPaul Mullowney 
608aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
609aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
610aa372e3fSPaul Mullowney 
611aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
612aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
613aa372e3fSPaul Mullowney 
614aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
615aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
616aa372e3fSPaul Mullowney 
617aa372e3fSPaul Mullowney       /* perform the solve analysis */
618aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
619aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
620aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
62157d48284SJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622aa372e3fSPaul Mullowney 
623aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
624aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
625aa372e3fSPaul Mullowney 
626aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
627aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
628aa372e3fSPaul Mullowney 
629aa372e3fSPaul Mullowney       /* Create the matrix description */
63057d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
63157d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
63257d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
63357d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
63457d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
635aa372e3fSPaul Mullowney 
636aa372e3fSPaul Mullowney       /* Create the solve analysis information */
63757d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
638aa372e3fSPaul Mullowney 
639aa372e3fSPaul Mullowney       /* set the operation */
640aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
641aa372e3fSPaul Mullowney 
642aa372e3fSPaul Mullowney       /* set the matrix */
643aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
644aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
645aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
646aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
647aa372e3fSPaul Mullowney 
648aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
649aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
650aa372e3fSPaul Mullowney 
651aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
652aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
653aa372e3fSPaul Mullowney 
654aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
655aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
6564863603aSSatish Balay       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
657aa372e3fSPaul Mullowney 
658aa372e3fSPaul Mullowney       /* perform the solve analysis */
659aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
660aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
661aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
66257d48284SJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
663aa372e3fSPaul Mullowney 
664aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
665aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
666087f3262SPaul Mullowney 
667c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
66857d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
66957d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
67057d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
67157d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
672087f3262SPaul Mullowney     } catch(char *ex) {
673087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
674087f3262SPaul Mullowney     }
675087f3262SPaul Mullowney   }
676087f3262SPaul Mullowney   PetscFunctionReturn(0);
677087f3262SPaul Mullowney }
678087f3262SPaul Mullowney 
679087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6809ae82921SPaul Mullowney {
6819ae82921SPaul Mullowney   PetscErrorCode               ierr;
682087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
683087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
684087f3262SPaul Mullowney   IS                           ip = a->row;
685087f3262SPaul Mullowney   const PetscInt               *rip;
686087f3262SPaul Mullowney   PetscBool                    perm_identity;
687087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
688087f3262SPaul Mullowney 
689087f3262SPaul Mullowney   PetscFunctionBegin;
690ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
691087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
692e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
693aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
694aa372e3fSPaul Mullowney 
695087f3262SPaul Mullowney   /* lower triangular indices */
696087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
697087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
698087f3262SPaul Mullowney   if (!perm_identity) {
6994e4bbfaaSStefano Zampini     IS             iip;
7004e4bbfaaSStefano Zampini     const PetscInt *irip;
7014e4bbfaaSStefano Zampini 
7024e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
7034e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
704aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
705aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
706aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
7074e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
7084e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
7094e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
7104863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
711087f3262SPaul Mullowney   }
712087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
713087f3262SPaul Mullowney   PetscFunctionReturn(0);
714087f3262SPaul Mullowney }
715087f3262SPaul Mullowney 
7166fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
7179ae82921SPaul Mullowney {
7189ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
7199ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
7209ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
721b175d8bbSPaul Mullowney   PetscErrorCode ierr;
7229ae82921SPaul Mullowney 
7239ae82921SPaul Mullowney   PetscFunctionBegin;
7249ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
725ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
726e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
7279ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
7289ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
729bda325fcSPaul Mullowney   if (row_identity && col_identity) {
730bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
731bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
7324e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7334e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
734bda325fcSPaul Mullowney   } else {
735bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
736bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
7374e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7384e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
739bda325fcSPaul Mullowney   }
7408dc1d2a3SPaul Mullowney 
741e057df02SPaul Mullowney   /* get the triangular factors */
742087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
7439ae82921SPaul Mullowney   PetscFunctionReturn(0);
7449ae82921SPaul Mullowney }
7459ae82921SPaul Mullowney 
746087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
747087f3262SPaul Mullowney {
748087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
749087f3262SPaul Mullowney   IS             ip = b->row;
750087f3262SPaul Mullowney   PetscBool      perm_identity;
751b175d8bbSPaul Mullowney   PetscErrorCode ierr;
752087f3262SPaul Mullowney 
753087f3262SPaul Mullowney   PetscFunctionBegin;
754087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
755ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
756087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
757087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
758087f3262SPaul Mullowney   if (perm_identity) {
759087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
760087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
7614e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7624e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
763087f3262SPaul Mullowney   } else {
764087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
765087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
7664e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7674e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
768087f3262SPaul Mullowney   }
769087f3262SPaul Mullowney 
770087f3262SPaul Mullowney   /* get the triangular factors */
771087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
772087f3262SPaul Mullowney   PetscFunctionReturn(0);
773087f3262SPaul Mullowney }
7749ae82921SPaul Mullowney 
775b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
776bda325fcSPaul Mullowney {
777bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
778aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
779aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
780aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
781aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
782bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
783aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
784aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
785aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
786aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
787b175d8bbSPaul Mullowney 
788bda325fcSPaul Mullowney   PetscFunctionBegin;
789bda325fcSPaul Mullowney 
790aa372e3fSPaul Mullowney   /*********************************************/
791aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
792aa372e3fSPaul Mullowney   /*********************************************/
793aa372e3fSPaul Mullowney 
794aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
795aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
796aa372e3fSPaul Mullowney 
797aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
798aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
799aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
800aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
801aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
802aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
803aa372e3fSPaul Mullowney 
804aa372e3fSPaul Mullowney   /* Create the matrix description */
80557d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
80657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
80757d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
80857d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
80957d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
810aa372e3fSPaul Mullowney 
811aa372e3fSPaul Mullowney   /* Create the solve analysis information */
81257d48284SJunchao Zhang   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
813aa372e3fSPaul Mullowney 
814aa372e3fSPaul Mullowney   /* set the operation */
815aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
816aa372e3fSPaul Mullowney 
817aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
818aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
819aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
820aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
821aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
822aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
823aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
824aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
825aa372e3fSPaul Mullowney 
826aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
827aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
828aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
829aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
830aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
831aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
832aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
833aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
834aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
83557d48284SJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
836aa372e3fSPaul Mullowney 
837aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
838aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
839aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
840aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
841aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
84257d48284SJunchao Zhang                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
843aa372e3fSPaul Mullowney 
844aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
845aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney   /*********************************************/
848aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
849aa372e3fSPaul Mullowney   /*********************************************/
850aa372e3fSPaul Mullowney 
851aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
852aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
853aa372e3fSPaul Mullowney 
854aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
855aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
856aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
857aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
858aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
859aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
860aa372e3fSPaul Mullowney 
861aa372e3fSPaul Mullowney   /* Create the matrix description */
86257d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
86357d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
86457d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
86557d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
86657d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
867aa372e3fSPaul Mullowney 
868aa372e3fSPaul Mullowney   /* Create the solve analysis information */
86957d48284SJunchao Zhang   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
870aa372e3fSPaul Mullowney 
871aa372e3fSPaul Mullowney   /* set the operation */
872aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
873aa372e3fSPaul Mullowney 
874aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
875aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
876aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
877aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
878aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
879aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
880aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
881aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
882aa372e3fSPaul Mullowney 
883aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
884aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
885aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
886aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
887aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
888aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
889aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
890aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
891aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
89257d48284SJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
893aa372e3fSPaul Mullowney 
894aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
895aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
896aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
897aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
898aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
89957d48284SJunchao Zhang                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
900aa372e3fSPaul Mullowney 
901aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
902aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
903bda325fcSPaul Mullowney   PetscFunctionReturn(0);
904bda325fcSPaul Mullowney }
905bda325fcSPaul Mullowney 
906b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
907bda325fcSPaul Mullowney {
908aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
909aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
910aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
911bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
912bda325fcSPaul Mullowney   cusparseStatus_t             stat;
913aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
914b06137fdSPaul Mullowney   cudaError_t                  err;
91585ba7357SStefano Zampini   PetscErrorCode               ierr;
916b175d8bbSPaul Mullowney 
917bda325fcSPaul Mullowney   PetscFunctionBegin;
91885ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
91985ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
92085ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
92185ba7357SStefano Zampini   /* create cusparse matrix */
922aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
92357d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
924aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
92557d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
92657d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
927aa372e3fSPaul Mullowney 
928b06137fdSPaul Mullowney   /* set alpha and beta */
929c41cb2e2SAlejandro Lamas Daviña   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
9307656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
9317656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
9327656d835SStefano Zampini   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
9337656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
9347656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
93557d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
936b06137fdSPaul Mullowney 
937aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
938aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
939aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
940554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
941554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
942aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
943a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
944aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
945aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
946a3fdcf43SKarl Rupp 
94781902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
94881902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
94981902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
950aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
951a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
952a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
953aa372e3fSPaul Mullowney                             matrix->values->data().get(),
95481902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
955aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
956aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
957aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
958aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
95957d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
960aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
961aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
962aa372e3fSPaul Mullowney     CsrMatrix *temp = new CsrMatrix;
96351c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
96451c6d536SStefano Zampini 
96551c6d536SStefano Zampini     /* First convert HYB to CSR */
966aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
967aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
968aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
969aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
970aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
971aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
972aa372e3fSPaul Mullowney 
973aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
974aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
975aa372e3fSPaul Mullowney                             temp->values->data().get(),
976aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
97757d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
978aa372e3fSPaul Mullowney 
979aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
980aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
981aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
982aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
983aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
984aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
985aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
986aa372e3fSPaul Mullowney 
987aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
988aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
989aa372e3fSPaul Mullowney                             temp->values->data().get(),
990aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
991aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
992aa372e3fSPaul Mullowney                             tempT->values->data().get(),
993aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
994aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
99557d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
996aa372e3fSPaul Mullowney 
997aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
998aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
99957d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1000aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1001aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1002aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1003aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1004aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1005aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
100657d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1007aa372e3fSPaul Mullowney 
1008aa372e3fSPaul Mullowney     /* assign the pointer */
1009aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1010aa372e3fSPaul Mullowney     /* delete temporaries */
1011aa372e3fSPaul Mullowney     if (tempT) {
1012aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1013aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1014aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1015aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1016087f3262SPaul Mullowney     }
1017aa372e3fSPaul Mullowney     if (temp) {
1018aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1019aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1020aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1021aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1022aa372e3fSPaul Mullowney     }
1023aa372e3fSPaul Mullowney   }
102485ba7357SStefano Zampini   err  = WaitForGPU();CHKERRCUDA(err);
102585ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
102685ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1027213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1028213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1029aa372e3fSPaul Mullowney   /* assign the pointer */
1030aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1031bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1032bda325fcSPaul Mullowney }
1033bda325fcSPaul Mullowney 
10344e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
10356fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1036bda325fcSPaul Mullowney {
1037c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1038465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1039465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1040465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1041465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1042bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1043bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1044aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1045aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1046aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1047b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
104857d48284SJunchao Zhang   cudaError_t                           cerr;
1049bda325fcSPaul Mullowney 
1050bda325fcSPaul Mullowney   PetscFunctionBegin;
1051aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1052aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1053bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1054aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1055aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1056bda325fcSPaul Mullowney   }
1057bda325fcSPaul Mullowney 
1058bda325fcSPaul Mullowney   /* Get the GPU pointers */
1059c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1060c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1061c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1062c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1063bda325fcSPaul Mullowney 
10647a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1065aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1066c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1067c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1068c41cb2e2SAlejandro Lamas Daviña                xGPU);
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney   /* First, solve U */
1071aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
10727656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1073aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1074aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1075aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1076aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
107757d48284SJunchao Zhang                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1078aa372e3fSPaul Mullowney 
1079aa372e3fSPaul Mullowney   /* Then, solve L */
1080aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
10817656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1082aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1083aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1084aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1085aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
108657d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1087aa372e3fSPaul Mullowney 
1088aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1089c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1090c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1091aa372e3fSPaul Mullowney                tempGPU->begin());
1092aa372e3fSPaul Mullowney 
1093aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1094c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1095bda325fcSPaul Mullowney 
1096bda325fcSPaul Mullowney   /* restore */
1097c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1098c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
109957d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1100661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1101958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1102bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1103bda325fcSPaul Mullowney }
1104bda325fcSPaul Mullowney 
11056fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1106bda325fcSPaul Mullowney {
1107465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1108465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1109bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1110bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1111aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1112aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1113aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1114b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
111557d48284SJunchao Zhang   cudaError_t                       cerr;
1116bda325fcSPaul Mullowney 
1117bda325fcSPaul Mullowney   PetscFunctionBegin;
1118aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1119aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1120bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1121aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1122aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1123bda325fcSPaul Mullowney   }
1124bda325fcSPaul Mullowney 
1125bda325fcSPaul Mullowney   /* Get the GPU pointers */
1126c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1127c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1128bda325fcSPaul Mullowney 
11297a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1130aa372e3fSPaul Mullowney   /* First, solve U */
1131aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
11327656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1133aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1134aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1135aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1136aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
113757d48284SJunchao Zhang                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1138aa372e3fSPaul Mullowney 
1139aa372e3fSPaul Mullowney   /* Then, solve L */
1140aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
11417656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1142aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1143aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1144aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1145aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
114657d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1147bda325fcSPaul Mullowney 
1148bda325fcSPaul Mullowney   /* restore */
1149c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1150c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
115157d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1152661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1153958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1154bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1155bda325fcSPaul Mullowney }
1156bda325fcSPaul Mullowney 
11576fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
11589ae82921SPaul Mullowney {
1159465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1160465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1161465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1162465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
11639ae82921SPaul Mullowney   cusparseStatus_t                      stat;
11649ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1165aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1166aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1167aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1168b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
116957d48284SJunchao Zhang   cudaError_t                           cerr;
11709ae82921SPaul Mullowney 
11719ae82921SPaul Mullowney   PetscFunctionBegin;
1172ebc8f436SDominic Meiser 
1173e057df02SPaul Mullowney   /* Get the GPU pointers */
1174c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1175c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1176c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1177c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
11789ae82921SPaul Mullowney 
11797a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1180aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1181c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1182c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
11834e4bbfaaSStefano Zampini                tempGPU->begin());
1184aa372e3fSPaul Mullowney 
1185aa372e3fSPaul Mullowney   /* Next, solve L */
1186aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
11877656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1188aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1189aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1190aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1191aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
119257d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1193aa372e3fSPaul Mullowney 
1194aa372e3fSPaul Mullowney   /* Then, solve U */
1195aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
11967656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1197aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1198aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1199aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1200aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
120157d48284SJunchao Zhang                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1202aa372e3fSPaul Mullowney 
12034e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
12044e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
12054e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
12064e4bbfaaSStefano Zampini                xGPU);
12079ae82921SPaul Mullowney 
1208c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1209c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
121057d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1211661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1212958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12139ae82921SPaul Mullowney   PetscFunctionReturn(0);
12149ae82921SPaul Mullowney }
12159ae82921SPaul Mullowney 
12166fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
12179ae82921SPaul Mullowney {
1218465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1219465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
12209ae82921SPaul Mullowney   cusparseStatus_t                  stat;
12219ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1222aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1223aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1224aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1225b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
122657d48284SJunchao Zhang   cudaError_t                       cerr;
12279ae82921SPaul Mullowney 
12289ae82921SPaul Mullowney   PetscFunctionBegin;
1229e057df02SPaul Mullowney   /* Get the GPU pointers */
1230c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1231c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
12329ae82921SPaul Mullowney 
12337a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1234aa372e3fSPaul Mullowney   /* First, solve L */
1235aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
12367656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1237aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1238aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1239aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1240aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
124157d48284SJunchao Zhang                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1242aa372e3fSPaul Mullowney 
1243aa372e3fSPaul Mullowney   /* Next, solve U */
1244aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
12457656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1246aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1247aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1248aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1249aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
125057d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
12519ae82921SPaul Mullowney 
1252c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1253c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
125457d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1255661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1256958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12579ae82921SPaul Mullowney   PetscFunctionReturn(0);
12589ae82921SPaul Mullowney }
12599ae82921SPaul Mullowney 
12606fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
12619ae82921SPaul Mullowney {
1262aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
12637c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
12649ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1265213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
12669ae82921SPaul Mullowney   PetscErrorCode               ierr;
1267aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1268b06137fdSPaul Mullowney   cudaError_t                  err;
12699ae82921SPaul Mullowney 
12709ae82921SPaul Mullowney   PetscFunctionBegin;
127195639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1272c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
127381902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
127481902715SJunchao Zhang       /* Copy values only */
127581902715SJunchao Zhang       CsrMatrix *mat,*matT;
127681902715SJunchao Zhang       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
127785ba7357SStefano Zampini 
127885ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
127985ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
128081902715SJunchao Zhang       mat->values->assign(a->a, a->a+a->nz);
128185ba7357SStefano Zampini       err  = WaitForGPU();CHKERRCUDA(err);
128285ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
12834863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
128485ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
128581902715SJunchao Zhang 
128681902715SJunchao Zhang       /* Update matT when it was built before */
128781902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
128881902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
128981902715SJunchao Zhang         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
129085ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
129185ba7357SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
129281902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
129381902715SJunchao Zhang                                  A->cmap->n, mat->num_entries,
129481902715SJunchao Zhang                                  mat->values->data().get(),
129581902715SJunchao Zhang                                  cusparsestruct->rowoffsets_gpu->data().get(),
129681902715SJunchao Zhang                                  mat->column_indices->data().get(),
129781902715SJunchao Zhang                                  matT->values->data().get(),
129881902715SJunchao Zhang                                  matT->column_indices->data().get(),
129981902715SJunchao Zhang                                  matT->row_offsets->data().get(),
13009f74ec24SSatish Balay                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
130185ba7357SStefano Zampini         err  = WaitForGPU();CHKERRCUDA(err);
130285ba7357SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
130385ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
130481902715SJunchao Zhang       }
130534d6c7a5SJose E. Roman     } else {
130685ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
130785ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
13087c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
13097c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
13107c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
131181902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
13129ae82921SPaul Mullowney       try {
13139ae82921SPaul Mullowney         if (a->compressedrow.use) {
13149ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
13159ae82921SPaul Mullowney           ii   = a->compressedrow.i;
13169ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
13179ae82921SPaul Mullowney         } else {
1318213423ffSJunchao Zhang           m    = A->rmap->n;
1319213423ffSJunchao Zhang           ii   = a->i;
1320e6e9a74fSStefano Zampini           ridx = NULL;
13219ae82921SPaul Mullowney         }
1322213423ffSJunchao Zhang         cusparsestruct->nrows = m;
13239ae82921SPaul Mullowney 
132485ba7357SStefano Zampini         /* create cusparse matrix */
1325aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
132657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
132757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
132857d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
13299ae82921SPaul Mullowney 
1330c41cb2e2SAlejandro Lamas Daviña         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
13317656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
13327656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
13337656d835SStefano Zampini         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
13347656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
13357656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
133657d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1337b06137fdSPaul Mullowney 
1338aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1339aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1340aa372e3fSPaul Mullowney           /* set the matrix */
1341aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1342a65300a6SPaul Mullowney           matrix->num_rows = m;
1343aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1344aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1345a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1346a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
13479ae82921SPaul Mullowney 
1348aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1349aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1350aa372e3fSPaul Mullowney 
1351aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1352aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1353aa372e3fSPaul Mullowney 
1354aa372e3fSPaul Mullowney           /* assign the pointer */
1355aa372e3fSPaul Mullowney           matstruct->mat = matrix;
1356aa372e3fSPaul Mullowney 
1357aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1358aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1359a65300a6SPaul Mullowney           matrix->num_rows = m;
1360aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1361aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1362a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1363a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
1364aa372e3fSPaul Mullowney 
1365aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1366aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1367aa372e3fSPaul Mullowney 
1368aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1369aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1370aa372e3fSPaul Mullowney 
1371aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
137257d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1373aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1374aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1375a65300a6SPaul Mullowney           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1376aa372e3fSPaul Mullowney               matstruct->descr, matrix->values->data().get(),
1377aa372e3fSPaul Mullowney               matrix->row_offsets->data().get(),
1378aa372e3fSPaul Mullowney               matrix->column_indices->data().get(),
137957d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1380aa372e3fSPaul Mullowney           /* assign the pointer */
1381aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1382aa372e3fSPaul Mullowney 
1383aa372e3fSPaul Mullowney           if (matrix) {
1384aa372e3fSPaul Mullowney             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1385aa372e3fSPaul Mullowney             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1386aa372e3fSPaul Mullowney             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1387aa372e3fSPaul Mullowney             delete (CsrMatrix*)matrix;
1388087f3262SPaul Mullowney           }
1389087f3262SPaul Mullowney         }
1390ca45077fSPaul Mullowney 
1391aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1392213423ffSJunchao Zhang         if (a->compressedrow.use) {
1393213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1394aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1395aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1396213423ffSJunchao Zhang           tmp = m;
1397213423ffSJunchao Zhang         } else {
1398213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1399213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1400213423ffSJunchao Zhang           tmp = 0;
1401213423ffSJunchao Zhang         }
1402213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1403aa372e3fSPaul Mullowney 
1404aa372e3fSPaul Mullowney         /* assign the pointer */
1405aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
14069ae82921SPaul Mullowney       } catch(char *ex) {
14079ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
14089ae82921SPaul Mullowney       }
140985ba7357SStefano Zampini       err  = WaitForGPU();CHKERRCUDA(err);
141085ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
141185ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
141234d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
141334d6c7a5SJose E. Roman     }
1414c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
14159ae82921SPaul Mullowney   }
14169ae82921SPaul Mullowney   PetscFunctionReturn(0);
14179ae82921SPaul Mullowney }
14189ae82921SPaul Mullowney 
1419c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1420aa372e3fSPaul Mullowney {
1421aa372e3fSPaul Mullowney   template <typename Tuple>
1422aa372e3fSPaul Mullowney   __host__ __device__
1423aa372e3fSPaul Mullowney   void operator()(Tuple t)
1424aa372e3fSPaul Mullowney   {
1425aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1426aa372e3fSPaul Mullowney   }
1427aa372e3fSPaul Mullowney };
1428aa372e3fSPaul Mullowney 
1429e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1430e6e9a74fSStefano Zampini {
1431e6e9a74fSStefano Zampini   template <typename Tuple>
1432e6e9a74fSStefano Zampini   __host__ __device__
1433e6e9a74fSStefano Zampini   void operator()(Tuple t)
1434e6e9a74fSStefano Zampini   {
1435e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1436e6e9a74fSStefano Zampini   }
1437e6e9a74fSStefano Zampini };
1438e6e9a74fSStefano Zampini 
1439ccdfe979SStefano Zampini typedef struct {
1440ccdfe979SStefano Zampini   PetscBool   cisdense;
1441ccdfe979SStefano Zampini   PetscScalar *Bt;
1442ccdfe979SStefano Zampini   Mat         X;
1443ccdfe979SStefano Zampini } MatMatCusparse;
1444ccdfe979SStefano Zampini 
1445ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1446ccdfe979SStefano Zampini {
1447ccdfe979SStefano Zampini   PetscErrorCode ierr;
1448ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1449ccdfe979SStefano Zampini   cudaError_t    cerr;
1450ccdfe979SStefano Zampini 
1451ccdfe979SStefano Zampini   PetscFunctionBegin;
1452ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1453ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1454ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1455ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1456ccdfe979SStefano Zampini }
1457ccdfe979SStefano Zampini 
1458ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1459ccdfe979SStefano Zampini 
1460ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1461ccdfe979SStefano Zampini {
1462ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1463ccdfe979SStefano Zampini   Mat                          A,B;
1464ccdfe979SStefano Zampini   PetscInt                     m,n,k,blda,clda;
1465ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1466ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1467ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1468ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1469ccdfe979SStefano Zampini   cusparseMatDescr_t           matA;
1470ccdfe979SStefano Zampini   const PetscScalar            *barray;
1471ccdfe979SStefano Zampini   PetscScalar                  *carray;
1472ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1473ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1474ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1475ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1476c8378d12SStefano Zampini   cudaError_t                  cuer;
1477ccdfe979SStefano Zampini 
1478ccdfe979SStefano Zampini   PetscFunctionBegin;
1479ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1480ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1481ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1482ccdfe979SStefano Zampini   A    = product->A;
1483ccdfe979SStefano Zampini   B    = product->B;
1484ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1485ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1486ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1487ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1488ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1489ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1490ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1491ccdfe979SStefano Zampini   switch (product->type) {
1492ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1493ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1494ccdfe979SStefano Zampini     mat = cusp->mat;
1495ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1496ccdfe979SStefano Zampini     m   = A->rmap->n;
1497ccdfe979SStefano Zampini     k   = A->cmap->n;
1498ccdfe979SStefano Zampini     n   = B->cmap->n;
1499ccdfe979SStefano Zampini     break;
1500ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1501e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1502e6e9a74fSStefano Zampini       mat = cusp->mat;
1503e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1504e6e9a74fSStefano Zampini     } else {
1505ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1506ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1507ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1508e6e9a74fSStefano Zampini     }
1509ccdfe979SStefano Zampini     m = A->cmap->n;
1510ccdfe979SStefano Zampini     k = A->rmap->n;
1511ccdfe979SStefano Zampini     n = B->cmap->n;
1512ccdfe979SStefano Zampini     break;
1513ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1514ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1515ccdfe979SStefano Zampini     mat = cusp->mat;
1516ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1517ccdfe979SStefano Zampini     m   = A->rmap->n;
1518ccdfe979SStefano Zampini     k   = B->cmap->n;
1519ccdfe979SStefano Zampini     n   = B->rmap->n;
1520ccdfe979SStefano Zampini     break;
1521ccdfe979SStefano Zampini   default:
1522ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1523ccdfe979SStefano Zampini   }
1524ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1525ccdfe979SStefano Zampini   matA   = mat->descr;
1526ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1527ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1528ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1529ccdfe979SStefano Zampini   if (!biscuda) {
1530ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1531ccdfe979SStefano Zampini   }
1532ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1533ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1534c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1535c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1536c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1537c8378d12SStefano Zampini   } else {
1538c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1539c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1540c8378d12SStefano Zampini   }
1541c8378d12SStefano Zampini 
1542c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1543c8378d12SStefano Zampini 
1544ccdfe979SStefano Zampini   /* cusparse spmm does not support transpose on B */
1545ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1546ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
1547ccdfe979SStefano Zampini     cublasStatus_t cerr;
1548ccdfe979SStefano Zampini 
1549ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1550ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1551ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
1552ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
1553ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
1554ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1555ccdfe979SStefano Zampini     blda = B->cmap->n;
1556ccdfe979SStefano Zampini   }
1557ccdfe979SStefano Zampini 
1558ccdfe979SStefano Zampini   /* perform the MatMat operation */
1559ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1560ccdfe979SStefano Zampini                            csrmat->num_entries,mat->alpha,matA,
1561ccdfe979SStefano Zampini                            csrmat->values->data().get(),
1562ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
1563ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
1564ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1565ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
1566c8378d12SStefano Zampini   cuer = WaitForGPU();CHKERRCUDA(cuer);
1567c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1568c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1569ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1570ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
1571ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1572ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1573ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
1574ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1575ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1576ccdfe979SStefano Zampini   } else {
1577ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1578ccdfe979SStefano Zampini   }
1579ccdfe979SStefano Zampini   if (mmdata->cisdense) {
1580ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1581ccdfe979SStefano Zampini   }
1582ccdfe979SStefano Zampini   if (!biscuda) {
1583ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1584ccdfe979SStefano Zampini   }
1585ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1586ccdfe979SStefano Zampini }
1587ccdfe979SStefano Zampini 
1588ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1589ccdfe979SStefano Zampini {
1590ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
1591ccdfe979SStefano Zampini   Mat                A,B;
1592ccdfe979SStefano Zampini   PetscInt           m,n;
1593ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
1594ccdfe979SStefano Zampini   PetscErrorCode     ierr;
1595ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
1596ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
1597ccdfe979SStefano Zampini 
1598ccdfe979SStefano Zampini   PetscFunctionBegin;
1599ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1600ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1601ccdfe979SStefano Zampini   A    = product->A;
1602ccdfe979SStefano Zampini   B    = product->B;
1603ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1604ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1605ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1606ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1607ccdfe979SStefano Zampini   switch (product->type) {
1608ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1609ccdfe979SStefano Zampini     m = A->rmap->n;
1610ccdfe979SStefano Zampini     n = B->cmap->n;
1611ccdfe979SStefano Zampini     break;
1612ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1613ccdfe979SStefano Zampini     m = A->cmap->n;
1614ccdfe979SStefano Zampini     n = B->cmap->n;
1615ccdfe979SStefano Zampini     break;
1616ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1617ccdfe979SStefano Zampini     m = A->rmap->n;
1618ccdfe979SStefano Zampini     n = B->rmap->n;
1619ccdfe979SStefano Zampini     break;
1620ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1621ccdfe979SStefano Zampini     m = B->cmap->n;
1622ccdfe979SStefano Zampini     n = B->cmap->n;
1623ccdfe979SStefano Zampini     break;
1624ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1625ccdfe979SStefano Zampini     m = B->rmap->n;
1626ccdfe979SStefano Zampini     n = B->rmap->n;
1627ccdfe979SStefano Zampini     break;
1628ccdfe979SStefano Zampini   default:
1629ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1630ccdfe979SStefano Zampini   }
1631ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1632ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1633ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1634ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1635ccdfe979SStefano Zampini 
1636ccdfe979SStefano Zampini   /* product data */
1637ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1638ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
1639ccdfe979SStefano Zampini   /* cusparse spmm does not support transpose on B */
1640ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1641ccdfe979SStefano Zampini     cudaError_t cerr;
1642ccdfe979SStefano Zampini 
1643ccdfe979SStefano Zampini     cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1644ccdfe979SStefano Zampini   }
1645ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
1646ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1647ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1648ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1649ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1650ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1651ccdfe979SStefano Zampini     } else {
1652ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1653ccdfe979SStefano Zampini     }
1654ccdfe979SStefano Zampini   }
1655ccdfe979SStefano Zampini   C->product->data    = mmdata;
1656ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
1657ccdfe979SStefano Zampini 
1658ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1659ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1660ccdfe979SStefano Zampini }
1661ccdfe979SStefano Zampini 
1662ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1663ccdfe979SStefano Zampini 
1664ccdfe979SStefano Zampini /* handles dense B */
1665ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
1666ccdfe979SStefano Zampini {
1667ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
1668ccdfe979SStefano Zampini   PetscErrorCode ierr;
1669ccdfe979SStefano Zampini 
1670ccdfe979SStefano Zampini   PetscFunctionBegin;
1671ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1672ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1673ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
1674ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
1675ccdfe979SStefano Zampini     PetscFunctionReturn(0);
1676ccdfe979SStefano Zampini   }
1677ccdfe979SStefano Zampini   switch (product->type) {
1678ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1679ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1680ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1681ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1682ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1683ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
1684ccdfe979SStefano Zampini   default:
1685ccdfe979SStefano Zampini     break;
1686ccdfe979SStefano Zampini   }
1687ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1688ccdfe979SStefano Zampini }
1689ccdfe979SStefano Zampini 
16906fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
16919ae82921SPaul Mullowney {
1692b175d8bbSPaul Mullowney   PetscErrorCode ierr;
16939ae82921SPaul Mullowney 
16949ae82921SPaul Mullowney   PetscFunctionBegin;
1695e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1696e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1697e6e9a74fSStefano Zampini }
1698e6e9a74fSStefano Zampini 
1699e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
1700e6e9a74fSStefano Zampini {
1701e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1702e6e9a74fSStefano Zampini 
1703e6e9a74fSStefano Zampini   PetscFunctionBegin;
1704e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1705e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1706e6e9a74fSStefano Zampini }
1707e6e9a74fSStefano Zampini 
1708e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1709e6e9a74fSStefano Zampini {
1710e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1711e6e9a74fSStefano Zampini 
1712e6e9a74fSStefano Zampini   PetscFunctionBegin;
1713e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
1714e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1715e6e9a74fSStefano Zampini }
1716e6e9a74fSStefano Zampini 
1717e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1718e6e9a74fSStefano Zampini {
1719e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1720e6e9a74fSStefano Zampini 
1721e6e9a74fSStefano Zampini   PetscFunctionBegin;
1722e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
17239ae82921SPaul Mullowney   PetscFunctionReturn(0);
17249ae82921SPaul Mullowney }
17259ae82921SPaul Mullowney 
17266fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1727ca45077fSPaul Mullowney {
1728b175d8bbSPaul Mullowney   PetscErrorCode ierr;
1729ca45077fSPaul Mullowney 
1730ca45077fSPaul Mullowney   PetscFunctionBegin;
1731e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1732ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1733ca45077fSPaul Mullowney }
1734ca45077fSPaul Mullowney 
1735e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
17369ae82921SPaul Mullowney {
17379ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1738aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
17399ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1740e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
1741b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
174257d48284SJunchao Zhang   cudaError_t                  cerr;
1743aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1744e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1745e6e9a74fSStefano Zampini   PetscBool                    compressed;
17466e111a19SKarl Rupp 
17479ae82921SPaul Mullowney   PetscFunctionBegin;
1748e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
1749e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
1750e6e9a74fSStefano Zampini     if (!yy) {
1751e6e9a74fSStefano Zampini       ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
1752e6e9a74fSStefano Zampini     }
1753e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
1754e6e9a74fSStefano Zampini   }
175534d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
175634d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1757e6e9a74fSStefano Zampini   if (!trans) {
17589ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1759e6e9a74fSStefano Zampini   } else {
1760e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
1761e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
1762e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1763e6e9a74fSStefano Zampini     } else {
1764e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1765e6e9a74fSStefano Zampini       if (!matstruct) {
1766e6e9a74fSStefano Zampini         ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1767e6e9a74fSStefano Zampini         matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1768e6e9a74fSStefano Zampini       }
1769e6e9a74fSStefano Zampini     }
1770e6e9a74fSStefano Zampini   }
1771e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
1772e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
1773213423ffSJunchao Zhang 
1774e6e9a74fSStefano Zampini   try {
1775e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
1776213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
1777213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
177885ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1779e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1780e6e9a74fSStefano Zampini       xptr = xarray;
1781e6e9a74fSStefano Zampini       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
1782213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
1783e6e9a74fSStefano Zampini     } else {
1784e6e9a74fSStefano Zampini       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
1785e6e9a74fSStefano Zampini       dptr = zarray;
1786e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
1787e6e9a74fSStefano Zampini 
1788e6e9a74fSStefano Zampini       if (compressed) {
1789e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
1790e6e9a74fSStefano Zampini 
1791e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
1792e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1793e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
1794e6e9a74fSStefano Zampini       }
1795e6e9a74fSStefano Zampini     }
17969ae82921SPaul Mullowney 
1797213423ffSJunchao Zhang     /* csr_spmv does y = alpha*Ax + beta*y */
1798aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
17997656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1800e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
1801a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1802b06137fdSPaul Mullowney                                mat->num_entries, matstruct->alpha, matstruct->descr,
1803aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1804e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
180557d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
1806aa372e3fSPaul Mullowney     } else {
1807213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
1808301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1809e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
1810b06137fdSPaul Mullowney                                  matstruct->alpha, matstruct->descr, hybMat,
1811e6e9a74fSStefano Zampini                                  xptr, beta,
181257d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
1813a65300a6SPaul Mullowney       }
1814aa372e3fSPaul Mullowney     }
1815e6e9a74fSStefano Zampini     cerr = WaitForGPU();CHKERRCUDA(cerr);
1816958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1817aa372e3fSPaul Mullowney 
1818e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1819213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
1820213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
1821213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
1822e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
1823213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
18247656d835SStefano Zampini         }
1825213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
1826c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
18277656d835SStefano Zampini       }
18287656d835SStefano Zampini 
1829213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
1830213423ffSJunchao Zhang       if (compressed) {
1831213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
1832e6e9a74fSStefano Zampini 
1833e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1834c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1835e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1836c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
1837e6e9a74fSStefano Zampini         cerr = WaitForGPU();CHKERRCUDA(cerr);
1838958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1839e6e9a74fSStefano Zampini       }
1840e6e9a74fSStefano Zampini     } else {
1841e6e9a74fSStefano Zampini       if (yy && yy != zz) {
1842e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
1843e6e9a74fSStefano Zampini       }
1844e6e9a74fSStefano Zampini     }
1845e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
1846213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
1847213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
18489ae82921SPaul Mullowney   } catch(char *ex) {
18499ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18509ae82921SPaul Mullowney   }
1851e6e9a74fSStefano Zampini   if (yy) {
1852958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1853e6e9a74fSStefano Zampini   } else {
1854e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
1855e6e9a74fSStefano Zampini   }
18569ae82921SPaul Mullowney   PetscFunctionReturn(0);
18579ae82921SPaul Mullowney }
18589ae82921SPaul Mullowney 
18596fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1860ca45077fSPaul Mullowney {
1861b175d8bbSPaul Mullowney   PetscErrorCode ierr;
18626e111a19SKarl Rupp 
1863ca45077fSPaul Mullowney   PetscFunctionBegin;
1864e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1865ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1866ca45077fSPaul Mullowney }
1867ca45077fSPaul Mullowney 
18686fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
18699ae82921SPaul Mullowney {
18709ae82921SPaul Mullowney   PetscErrorCode ierr;
18716e111a19SKarl Rupp 
18729ae82921SPaul Mullowney   PetscFunctionBegin;
18739ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
187495639643SRichard Tran Mills   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
1875bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1876e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1877bc3f50f2SPaul Mullowney   }
18789ae82921SPaul Mullowney   PetscFunctionReturn(0);
18799ae82921SPaul Mullowney }
18809ae82921SPaul Mullowney 
18819ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
1882e057df02SPaul Mullowney /*@
18839ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1884e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1885e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1886e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1887e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1888e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
18899ae82921SPaul Mullowney 
1890d083f849SBarry Smith    Collective
18919ae82921SPaul Mullowney 
18929ae82921SPaul Mullowney    Input Parameters:
18939ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
18949ae82921SPaul Mullowney .  m - number of rows
18959ae82921SPaul Mullowney .  n - number of columns
18969ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
18979ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
18980298fd71SBarry Smith          (possibly different for each row) or NULL
18999ae82921SPaul Mullowney 
19009ae82921SPaul Mullowney    Output Parameter:
19019ae82921SPaul Mullowney .  A - the matrix
19029ae82921SPaul Mullowney 
19039ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
19049ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
19059ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
19069ae82921SPaul Mullowney 
19079ae82921SPaul Mullowney    Notes:
19089ae82921SPaul Mullowney    If nnz is given then nz is ignored
19099ae82921SPaul Mullowney 
19109ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
19119ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
19129ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
19139ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
19149ae82921SPaul Mullowney 
19159ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
19160298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
19179ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
19189ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
19199ae82921SPaul Mullowney 
19209ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
19219ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
19229ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
19239ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
19249ae82921SPaul Mullowney 
19259ae82921SPaul Mullowney    Level: intermediate
19269ae82921SPaul Mullowney 
1927e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
19289ae82921SPaul Mullowney @*/
19299ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
19309ae82921SPaul Mullowney {
19319ae82921SPaul Mullowney   PetscErrorCode ierr;
19329ae82921SPaul Mullowney 
19339ae82921SPaul Mullowney   PetscFunctionBegin;
19349ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
19359ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
19369ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
19379ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
19389ae82921SPaul Mullowney   PetscFunctionReturn(0);
19399ae82921SPaul Mullowney }
19409ae82921SPaul Mullowney 
19416fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
19429ae82921SPaul Mullowney {
19439ae82921SPaul Mullowney   PetscErrorCode ierr;
1944ab25e6cbSDominic Meiser 
19459ae82921SPaul Mullowney   PetscFunctionBegin;
19469ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1947470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
19489ae82921SPaul Mullowney   } else {
1949470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1950aa372e3fSPaul Mullowney   }
1951ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
1952ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
1953ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
1954ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
19559ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
19569ae82921SPaul Mullowney   PetscFunctionReturn(0);
19579ae82921SPaul Mullowney }
19589ae82921SPaul Mullowney 
1959ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
196095639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
19619ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
19629ff858a8SKarl Rupp {
19639ff858a8SKarl Rupp   PetscErrorCode ierr;
19649ff858a8SKarl Rupp 
19659ff858a8SKarl Rupp   PetscFunctionBegin;
19669ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1967ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
19689ff858a8SKarl Rupp   PetscFunctionReturn(0);
19699ff858a8SKarl Rupp }
19709ff858a8SKarl Rupp 
197195639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
197295639643SRichard Tran Mills {
1973e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1974e6e9a74fSStefano Zampini 
197595639643SRichard Tran Mills   PetscFunctionBegin;
1976e6e9a74fSStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
1977c34f1ff0SRichard Tran Mills   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
197895639643SRichard Tran Mills      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1979e6e9a74fSStefano Zampini      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
198095639643SRichard Tran Mills      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
198195639643SRichard Tran Mills            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1982e6e9a74fSStefano Zampini   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
1983e6e9a74fSStefano Zampini   /* TODO: add support for this? */
198495639643SRichard Tran Mills   if (flg) {
198595639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJ;
198695639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJ;
1987c34f1ff0SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
1988c34f1ff0SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
1989e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = NULL;
1990e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = NULL;
1991e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
1992e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
199395639643SRichard Tran Mills   } else {
199495639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
199595639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
199695639643SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
199795639643SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
1998e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
1999e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2000e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2001e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
200295639643SRichard Tran Mills   }
200395639643SRichard Tran Mills   A->boundtocpu = flg;
200495639643SRichard Tran Mills   PetscFunctionReturn(0);
200595639643SRichard Tran Mills }
200695639643SRichard Tran Mills 
200749735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
20089ae82921SPaul Mullowney {
20099ae82921SPaul Mullowney   PetscErrorCode   ierr;
2010aa372e3fSPaul Mullowney   cusparseStatus_t stat;
201149735bf3SStefano Zampini   Mat              B;
20129ae82921SPaul Mullowney 
20139ae82921SPaul Mullowney   PetscFunctionBegin;
201449735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
201549735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
201649735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
201749735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
201849735bf3SStefano Zampini   }
201949735bf3SStefano Zampini   B = *newmat;
202049735bf3SStefano Zampini 
202134136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
202234136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
202334136279SStefano Zampini 
202449735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
20259ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
2026e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
2027e6e9a74fSStefano Zampini 
2028e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2029e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
2030e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2031e6e9a74fSStefano Zampini       B->spptr = spptr;
20329ae82921SPaul Mullowney     } else {
2033e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
2034e6e9a74fSStefano Zampini 
2035e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2036e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2037e6e9a74fSStefano Zampini       B->spptr = spptr;
20389ae82921SPaul Mullowney     }
2039e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
204049735bf3SStefano Zampini   }
2041693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
20429ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
20439ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
204495639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2045693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
20462205254eSKarl Rupp 
2047e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
20489ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2049bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
20509ae82921SPaul Mullowney   PetscFunctionReturn(0);
20519ae82921SPaul Mullowney }
20529ae82921SPaul Mullowney 
205302fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
205402fe1965SBarry Smith {
205502fe1965SBarry Smith   PetscErrorCode ierr;
205602fe1965SBarry Smith 
205702fe1965SBarry Smith   PetscFunctionBegin;
205802fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
20590ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
206002fe1965SBarry Smith   PetscFunctionReturn(0);
206102fe1965SBarry Smith }
206202fe1965SBarry Smith 
20633ca39a21SBarry Smith /*MC
2064e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2065e057df02SPaul Mullowney 
2066e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
20672692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
20682692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2069e057df02SPaul Mullowney 
2070e057df02SPaul Mullowney    Options Database Keys:
2071e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2072aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2073a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2074e057df02SPaul Mullowney 
2075e057df02SPaul Mullowney   Level: beginner
2076e057df02SPaul Mullowney 
20778468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2078e057df02SPaul Mullowney M*/
20797f756511SDominic Meiser 
208042c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
208142c9c57cSBarry Smith 
20820f39cd5aSBarry Smith 
20833ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
208442c9c57cSBarry Smith {
208542c9c57cSBarry Smith   PetscErrorCode ierr;
208642c9c57cSBarry Smith 
208742c9c57cSBarry Smith   PetscFunctionBegin;
20883ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20893ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20903ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20913ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
209242c9c57cSBarry Smith   PetscFunctionReturn(0);
209342c9c57cSBarry Smith }
209429b38603SBarry Smith 
2095470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
20967f756511SDominic Meiser {
2097e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
20987f756511SDominic Meiser   cusparseStatus_t stat;
20997f756511SDominic Meiser   cusparseHandle_t handle;
21007f756511SDominic Meiser 
21017f756511SDominic Meiser   PetscFunctionBegin;
21027f756511SDominic Meiser   if (*cusparsestruct) {
2103e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2104e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
21057f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
210681902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
21077f756511SDominic Meiser     if (handle = (*cusparsestruct)->handle) {
210857d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
21097f756511SDominic Meiser     }
2110e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
21117f756511SDominic Meiser   }
21127f756511SDominic Meiser   PetscFunctionReturn(0);
21137f756511SDominic Meiser }
21147f756511SDominic Meiser 
21157f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
21167f756511SDominic Meiser {
21177f756511SDominic Meiser   PetscFunctionBegin;
21187f756511SDominic Meiser   if (*mat) {
21197f756511SDominic Meiser     delete (*mat)->values;
21207f756511SDominic Meiser     delete (*mat)->column_indices;
21217f756511SDominic Meiser     delete (*mat)->row_offsets;
21227f756511SDominic Meiser     delete *mat;
21237f756511SDominic Meiser     *mat = 0;
21247f756511SDominic Meiser   }
21257f756511SDominic Meiser   PetscFunctionReturn(0);
21267f756511SDominic Meiser }
21277f756511SDominic Meiser 
2128470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
21297f756511SDominic Meiser {
21307f756511SDominic Meiser   cusparseStatus_t stat;
21317f756511SDominic Meiser   PetscErrorCode   ierr;
21327f756511SDominic Meiser 
21337f756511SDominic Meiser   PetscFunctionBegin;
21347f756511SDominic Meiser   if (*trifactor) {
213557d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
213657d48284SJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
21377f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
21387f756511SDominic Meiser     delete *trifactor;
21397f756511SDominic Meiser     *trifactor = 0;
21407f756511SDominic Meiser   }
21417f756511SDominic Meiser   PetscFunctionReturn(0);
21427f756511SDominic Meiser }
21437f756511SDominic Meiser 
2144470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
21457f756511SDominic Meiser {
21467f756511SDominic Meiser   CsrMatrix        *mat;
21477f756511SDominic Meiser   cusparseStatus_t stat;
21487f756511SDominic Meiser   cudaError_t      err;
21497f756511SDominic Meiser 
21507f756511SDominic Meiser   PetscFunctionBegin;
21517f756511SDominic Meiser   if (*matstruct) {
21527f756511SDominic Meiser     if ((*matstruct)->mat) {
21537f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
21547f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
215557d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
21567f756511SDominic Meiser       } else {
21577f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
21587f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
21597f756511SDominic Meiser       }
21607f756511SDominic Meiser     }
216157d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
21627f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
2163c41cb2e2SAlejandro Lamas Daviña     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
21647656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
21657656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
21667f756511SDominic Meiser     delete *matstruct;
21677f756511SDominic Meiser     *matstruct = 0;
21687f756511SDominic Meiser   }
21697f756511SDominic Meiser   PetscFunctionReturn(0);
21707f756511SDominic Meiser }
21717f756511SDominic Meiser 
2172ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
21737f756511SDominic Meiser {
2174e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2175e6e9a74fSStefano Zampini 
21767f756511SDominic Meiser   PetscFunctionBegin;
21777f756511SDominic Meiser   if (*trifactors) {
2178e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
2179e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
2180e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
2181e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
21827f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
21837f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
21847f756511SDominic Meiser     delete (*trifactors)->workVector;
2185ccdfe979SStefano Zampini     (*trifactors)->rpermIndices = 0;
2186ccdfe979SStefano Zampini     (*trifactors)->cpermIndices = 0;
2187ccdfe979SStefano Zampini     (*trifactors)->workVector = 0;
2188ccdfe979SStefano Zampini   }
2189ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2190ccdfe979SStefano Zampini }
2191ccdfe979SStefano Zampini 
2192ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2193ccdfe979SStefano Zampini {
2194e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
2195ccdfe979SStefano Zampini   cusparseHandle_t handle;
2196ccdfe979SStefano Zampini   cusparseStatus_t stat;
2197ccdfe979SStefano Zampini 
2198ccdfe979SStefano Zampini   PetscFunctionBegin;
2199ccdfe979SStefano Zampini   if (*trifactors) {
2200e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
22017f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
220257d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
22037f756511SDominic Meiser     }
2204e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
22057f756511SDominic Meiser   }
22067f756511SDominic Meiser   PetscFunctionReturn(0);
22077f756511SDominic Meiser }
2208