xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2c7c0729d89b910088ee906aed2239ea927d033e)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
189ae82921SPaul Mullowney 
19087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22087f3262SPaul Mullowney 
236fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
246fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
256fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26087f3262SPaul Mullowney 
276fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
306fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
314416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
326fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
336fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
346fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
356fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
399ae82921SPaul Mullowney 
407f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
467f756511SDominic Meiser 
47b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
48b06137fdSPaul Mullowney {
49b06137fdSPaul Mullowney   cusparseStatus_t   stat;
50b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
51b06137fdSPaul Mullowney 
52b06137fdSPaul Mullowney   PetscFunctionBegin;
53b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
5457d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
55b06137fdSPaul Mullowney   PetscFunctionReturn(0);
56b06137fdSPaul Mullowney }
57b06137fdSPaul Mullowney 
58b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
59b06137fdSPaul Mullowney {
60b06137fdSPaul Mullowney   cusparseStatus_t   stat;
61b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
62b06137fdSPaul Mullowney 
63b06137fdSPaul Mullowney   PetscFunctionBegin;
646b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
6516a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
6657d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
6716a2e217SAlejandro Lamas Daviña     }
68b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
696b1cf21dSAlejandro Lamas Daviña   }
7057d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
71b06137fdSPaul Mullowney   PetscFunctionReturn(0);
72b06137fdSPaul Mullowney }
73b06137fdSPaul Mullowney 
74b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
75b06137fdSPaul Mullowney {
76b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
77ccdfe979SStefano Zampini 
78b06137fdSPaul Mullowney   PetscFunctionBegin;
79ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
80b06137fdSPaul Mullowney   PetscFunctionReturn(0);
81b06137fdSPaul Mullowney }
82b06137fdSPaul Mullowney 
83ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
849ae82921SPaul Mullowney {
859ae82921SPaul Mullowney   PetscFunctionBegin;
869ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
879ae82921SPaul Mullowney   PetscFunctionReturn(0);
889ae82921SPaul Mullowney }
899ae82921SPaul Mullowney 
90c708e6cdSJed Brown /*MC
91087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
93087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
94087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
97c708e6cdSJed Brown 
989ae82921SPaul Mullowney   Level: beginner
99c708e6cdSJed Brown 
1003ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101c708e6cdSJed Brown M*/
1029ae82921SPaul Mullowney 
10342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1049ae82921SPaul Mullowney {
1059ae82921SPaul Mullowney   PetscErrorCode ierr;
106bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1079ae82921SPaul Mullowney 
1089ae82921SPaul Mullowney   PetscFunctionBegin;
109bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
110bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
111*2c7c0729SBarry Smith   (*B)->factortype = ftype;
112*2c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1139ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1142205254eSKarl Rupp 
115087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
11633d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1179ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1189ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
119087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
120087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
121087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1229ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
123bc3f50f2SPaul Mullowney 
124fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1253ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1269ae82921SPaul Mullowney   PetscFunctionReturn(0);
1279ae82921SPaul Mullowney }
1289ae82921SPaul Mullowney 
129bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
130ca45077fSPaul Mullowney {
131aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1326e111a19SKarl Rupp 
133ca45077fSPaul Mullowney   PetscFunctionBegin;
134ca45077fSPaul Mullowney   switch (op) {
135e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
136aa372e3fSPaul Mullowney     cusparsestruct->format = format;
137ca45077fSPaul Mullowney     break;
138e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
139aa372e3fSPaul Mullowney     cusparsestruct->format = format;
140ca45077fSPaul Mullowney     break;
141ca45077fSPaul Mullowney   default:
14236d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
143ca45077fSPaul Mullowney   }
144ca45077fSPaul Mullowney   PetscFunctionReturn(0);
145ca45077fSPaul Mullowney }
1469ae82921SPaul Mullowney 
147e057df02SPaul Mullowney /*@
148e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
150aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
151e057df02SPaul Mullowney    Not Collective
152e057df02SPaul Mullowney 
153e057df02SPaul Mullowney    Input Parameters:
1548468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
15536d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1562692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157e057df02SPaul Mullowney 
158e057df02SPaul Mullowney    Output Parameter:
159e057df02SPaul Mullowney 
160e057df02SPaul Mullowney    Level: intermediate
161e057df02SPaul Mullowney 
1628468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163e057df02SPaul Mullowney @*/
164e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
165e057df02SPaul Mullowney {
166e057df02SPaul Mullowney   PetscErrorCode ierr;
1676e111a19SKarl Rupp 
168e057df02SPaul Mullowney   PetscFunctionBegin;
169e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
170e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
171e057df02SPaul Mullowney   PetscFunctionReturn(0);
172e057df02SPaul Mullowney }
173e057df02SPaul Mullowney 
174e6e9a74fSStefano Zampini /*@
175e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
176e6e9a74fSStefano Zampini 
177e6e9a74fSStefano Zampini    Collective on mat
178e6e9a74fSStefano Zampini 
179e6e9a74fSStefano Zampini    Input Parameters:
180e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
181e6e9a74fSStefano Zampini -  transgen - the boolean flag
182e6e9a74fSStefano Zampini 
183e6e9a74fSStefano Zampini    Level: intermediate
184e6e9a74fSStefano Zampini 
185e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
186e6e9a74fSStefano Zampini @*/
187e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
188e6e9a74fSStefano Zampini {
189e6e9a74fSStefano Zampini   PetscErrorCode ierr;
190e6e9a74fSStefano Zampini   PetscBool      flg;
191e6e9a74fSStefano Zampini 
192e6e9a74fSStefano Zampini   PetscFunctionBegin;
193e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
194e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
195e6e9a74fSStefano Zampini   if (flg) {
196e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
19754da937aSStefano Zampini 
198e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
199e6e9a74fSStefano Zampini     cusp->transgen = transgen;
20054da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
20154da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
20254da937aSStefano Zampini     }
203e6e9a74fSStefano Zampini   }
204e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
205e6e9a74fSStefano Zampini }
206e6e9a74fSStefano Zampini 
2074416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2089ae82921SPaul Mullowney {
2099ae82921SPaul Mullowney   PetscErrorCode           ierr;
210e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2119ae82921SPaul Mullowney   PetscBool                flg;
212a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2136e111a19SKarl Rupp 
2149ae82921SPaul Mullowney   PetscFunctionBegin;
215e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2169ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
21754da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
21854da937aSStefano Zampini 
21954da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
22054da937aSStefano Zampini     if (flg) {
22154da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);
22254da937aSStefano Zampini     }
223e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
224a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
225e057df02SPaul Mullowney     if (flg) {
226e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
227045c96e1SPaul Mullowney     }
2289ae82921SPaul Mullowney   }
2294c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
230a183c035SDominic Meiser                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
2314c87dfd4SPaul Mullowney   if (flg) {
2324c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
2334c87dfd4SPaul Mullowney   }
2340af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2359ae82921SPaul Mullowney   PetscFunctionReturn(0);
2369ae82921SPaul Mullowney }
2379ae82921SPaul Mullowney 
2386fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2399ae82921SPaul Mullowney {
2409ae82921SPaul Mullowney   PetscErrorCode ierr;
2419ae82921SPaul Mullowney 
2429ae82921SPaul Mullowney   PetscFunctionBegin;
2439ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2449ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2459ae82921SPaul Mullowney   PetscFunctionReturn(0);
2469ae82921SPaul Mullowney }
2479ae82921SPaul Mullowney 
2486fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2499ae82921SPaul Mullowney {
2509ae82921SPaul Mullowney   PetscErrorCode ierr;
2519ae82921SPaul Mullowney 
2529ae82921SPaul Mullowney   PetscFunctionBegin;
2539ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2549ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2559ae82921SPaul Mullowney   PetscFunctionReturn(0);
2569ae82921SPaul Mullowney }
2579ae82921SPaul Mullowney 
258087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
259087f3262SPaul Mullowney {
260087f3262SPaul Mullowney   PetscErrorCode ierr;
261087f3262SPaul Mullowney 
262087f3262SPaul Mullowney   PetscFunctionBegin;
263087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
264087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
265087f3262SPaul Mullowney   PetscFunctionReturn(0);
266087f3262SPaul Mullowney }
267087f3262SPaul Mullowney 
268087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
269087f3262SPaul Mullowney {
270087f3262SPaul Mullowney   PetscErrorCode ierr;
271087f3262SPaul Mullowney 
272087f3262SPaul Mullowney   PetscFunctionBegin;
273087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
274087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
275087f3262SPaul Mullowney   PetscFunctionReturn(0);
276087f3262SPaul Mullowney }
277087f3262SPaul Mullowney 
278087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2799ae82921SPaul Mullowney {
2809ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2819ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2829ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
283aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2849ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2859ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2869ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2879ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2889ae82921SPaul Mullowney   PetscScalar                       *AALo;
2899ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
290b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
29157d48284SJunchao Zhang   cudaError_t                       cerr;
2929ae82921SPaul Mullowney 
2939ae82921SPaul Mullowney   PetscFunctionBegin;
294cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
295c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2969ae82921SPaul Mullowney     try {
2979ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2989ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2999ae82921SPaul Mullowney 
3009ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
30157d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
30257d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
30357d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3049ae82921SPaul Mullowney 
3059ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
3069ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
3079ae82921SPaul Mullowney       AiLo[n]  = nzLower;
3089ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
3099ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
3109ae82921SPaul Mullowney       v        = aa;
3119ae82921SPaul Mullowney       vi       = aj;
3129ae82921SPaul Mullowney       offset   = 1;
3139ae82921SPaul Mullowney       rowOffset= 1;
3149ae82921SPaul Mullowney       for (i=1; i<n; i++) {
3159ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
316e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
3179ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
3189ae82921SPaul Mullowney         rowOffset += nz+1;
3199ae82921SPaul Mullowney 
320580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
321580bdb30SBarry Smith         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3229ae82921SPaul Mullowney 
3239ae82921SPaul Mullowney         offset      += nz;
3249ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
3259ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
3269ae82921SPaul Mullowney         offset      += 1;
3279ae82921SPaul Mullowney 
3289ae82921SPaul Mullowney         v  += nz;
3299ae82921SPaul Mullowney         vi += nz;
3309ae82921SPaul Mullowney       }
3312205254eSKarl Rupp 
332aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
333aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3342205254eSKarl Rupp 
335aa372e3fSPaul Mullowney       /* Create the matrix description */
33657d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
33757d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
33857d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
33957d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
34057d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
341aa372e3fSPaul Mullowney 
342aa372e3fSPaul Mullowney       /* Create the solve analysis information */
34357d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
344aa372e3fSPaul Mullowney 
345aa372e3fSPaul Mullowney       /* set the operation */
346aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
347aa372e3fSPaul Mullowney 
348aa372e3fSPaul Mullowney       /* set the matrix */
349aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
350aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
351aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
352aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
353aa372e3fSPaul Mullowney 
354aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
355aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
356aa372e3fSPaul Mullowney 
357aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
358aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
359aa372e3fSPaul Mullowney 
360aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
361aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
362aa372e3fSPaul Mullowney 
363aa372e3fSPaul Mullowney       /* perform the solve analysis */
364aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
365aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
366aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
36757d48284SJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
368aa372e3fSPaul Mullowney 
369aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
370aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3712205254eSKarl Rupp 
37257d48284SJunchao Zhang       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
37357d48284SJunchao Zhang       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
37457d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
3754863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
3769ae82921SPaul Mullowney     } catch(char *ex) {
3779ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3789ae82921SPaul Mullowney     }
3799ae82921SPaul Mullowney   }
3809ae82921SPaul Mullowney   PetscFunctionReturn(0);
3819ae82921SPaul Mullowney }
3829ae82921SPaul Mullowney 
383087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3849ae82921SPaul Mullowney {
3859ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3869ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3879ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
388aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3899ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3909ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3919ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3929ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3939ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3949ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3959ae82921SPaul Mullowney   PetscErrorCode                    ierr;
39657d48284SJunchao Zhang   cudaError_t                       cerr;
3979ae82921SPaul Mullowney 
3989ae82921SPaul Mullowney   PetscFunctionBegin;
399cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
400c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4019ae82921SPaul Mullowney     try {
4029ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4039ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
4049ae82921SPaul Mullowney 
4059ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
40657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
40757d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
40857d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
4099ae82921SPaul Mullowney 
4109ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
4119ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
4129ae82921SPaul Mullowney       AiUp[n]=nzUpper;
4139ae82921SPaul Mullowney       offset = nzUpper;
4149ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
4159ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
4169ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
4179ae82921SPaul Mullowney 
418e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
4199ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
4209ae82921SPaul Mullowney 
421e057df02SPaul Mullowney         /* decrement the offset */
4229ae82921SPaul Mullowney         offset -= (nz+1);
4239ae82921SPaul Mullowney 
424e057df02SPaul Mullowney         /* first, set the diagonal elements */
4259ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
42609f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1./v[nz];
4279ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
4289ae82921SPaul Mullowney 
429580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
430580bdb30SBarry Smith         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
4319ae82921SPaul Mullowney       }
4322205254eSKarl Rupp 
433aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
434aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
4352205254eSKarl Rupp 
436aa372e3fSPaul Mullowney       /* Create the matrix description */
43757d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
43857d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
43957d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
44057d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
44157d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
442aa372e3fSPaul Mullowney 
443aa372e3fSPaul Mullowney       /* Create the solve analysis information */
44457d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
445aa372e3fSPaul Mullowney 
446aa372e3fSPaul Mullowney       /* set the operation */
447aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
448aa372e3fSPaul Mullowney 
449aa372e3fSPaul Mullowney       /* set the matrix */
450aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
451aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
452aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
453aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
454aa372e3fSPaul Mullowney 
455aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
456aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
457aa372e3fSPaul Mullowney 
458aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
459aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
460aa372e3fSPaul Mullowney 
461aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
462aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
463aa372e3fSPaul Mullowney 
464aa372e3fSPaul Mullowney       /* perform the solve analysis */
465aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
466aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
467aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
46857d48284SJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
469aa372e3fSPaul Mullowney 
470aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
471aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4722205254eSKarl Rupp 
47357d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
47457d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
47557d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
4764863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
4779ae82921SPaul Mullowney     } catch(char *ex) {
4789ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4799ae82921SPaul Mullowney     }
4809ae82921SPaul Mullowney   }
4819ae82921SPaul Mullowney   PetscFunctionReturn(0);
4829ae82921SPaul Mullowney }
4839ae82921SPaul Mullowney 
484087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4859ae82921SPaul Mullowney {
4869ae82921SPaul Mullowney   PetscErrorCode               ierr;
4879ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4889ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4899ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4909ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4919ae82921SPaul Mullowney   const PetscInt               *r,*c;
4929ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4939ae82921SPaul Mullowney 
4949ae82921SPaul Mullowney   PetscFunctionBegin;
495ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
496087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
497087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4982205254eSKarl Rupp 
499e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
500aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
5019ae82921SPaul Mullowney 
502c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
503e057df02SPaul Mullowney   /* lower triangular indices */
5049ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5059ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5062205254eSKarl Rupp   if (!row_identity) {
507aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
508aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
5092205254eSKarl Rupp   }
5109ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5119ae82921SPaul Mullowney 
512e057df02SPaul Mullowney   /* upper triangular indices */
5139ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
5149ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5152205254eSKarl Rupp   if (!col_identity) {
516aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
517aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
5182205254eSKarl Rupp   }
5194863603aSSatish Balay 
5204863603aSSatish Balay   if (!row_identity && !col_identity) {
5214863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
5224863603aSSatish Balay   } else if(!row_identity) {
5234863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
5244863603aSSatish Balay   } else if(!col_identity) {
5254863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
5264863603aSSatish Balay   }
5274863603aSSatish Balay 
5289ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
5299ae82921SPaul Mullowney   PetscFunctionReturn(0);
5309ae82921SPaul Mullowney }
5319ae82921SPaul Mullowney 
532087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
533087f3262SPaul Mullowney {
534087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
535087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
536aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
537aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
538087f3262SPaul Mullowney   cusparseStatus_t                  stat;
539087f3262SPaul Mullowney   PetscErrorCode                    ierr;
54057d48284SJunchao Zhang   cudaError_t                       cerr;
541087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
542087f3262SPaul Mullowney   PetscScalar                       *AAUp;
543087f3262SPaul Mullowney   PetscScalar                       *AALo;
544087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
545087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
546087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
547087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
548087f3262SPaul Mullowney 
549087f3262SPaul Mullowney   PetscFunctionBegin;
550cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
551c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
552087f3262SPaul Mullowney     try {
553087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
55457d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
55557d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
55657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
55757d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
558087f3262SPaul Mullowney 
559087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
560087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
561087f3262SPaul Mullowney       AiUp[n]=nzUpper;
562087f3262SPaul Mullowney       offset = 0;
563087f3262SPaul Mullowney       for (i=0; i<n; i++) {
564087f3262SPaul Mullowney         /* set the pointers */
565087f3262SPaul Mullowney         v  = aa + ai[i];
566087f3262SPaul Mullowney         vj = aj + ai[i];
567087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
568087f3262SPaul Mullowney 
569087f3262SPaul Mullowney         /* first, set the diagonal elements */
570087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
57109f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1.0/v[nz];
572087f3262SPaul Mullowney         AiUp[i]      = offset;
57309f51544SAlejandro Lamas Daviña         AALo[offset] = (MatScalar)1.0/v[nz];
574087f3262SPaul Mullowney 
575087f3262SPaul Mullowney         offset+=1;
576087f3262SPaul Mullowney         if (nz>0) {
577f22e0265SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
578580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
579087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
580087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
581087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
582087f3262SPaul Mullowney           }
583087f3262SPaul Mullowney           offset+=nz;
584087f3262SPaul Mullowney         }
585087f3262SPaul Mullowney       }
586087f3262SPaul Mullowney 
587aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
588aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
589087f3262SPaul Mullowney 
590aa372e3fSPaul Mullowney       /* Create the matrix description */
59157d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
59257d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
59357d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
59457d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
59557d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
596087f3262SPaul Mullowney 
597aa372e3fSPaul Mullowney       /* Create the solve analysis information */
59857d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599aa372e3fSPaul Mullowney 
600aa372e3fSPaul Mullowney       /* set the operation */
601aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
602aa372e3fSPaul Mullowney 
603aa372e3fSPaul Mullowney       /* set the matrix */
604aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
605aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
606aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
607aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
608aa372e3fSPaul Mullowney 
609aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
610aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
611aa372e3fSPaul Mullowney 
612aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
613aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
614aa372e3fSPaul Mullowney 
615aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
616aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
617aa372e3fSPaul Mullowney 
618aa372e3fSPaul Mullowney       /* perform the solve analysis */
619aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
620aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
621aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
62257d48284SJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
623aa372e3fSPaul Mullowney 
624aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
625aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
626aa372e3fSPaul Mullowney 
627aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
628aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
629aa372e3fSPaul Mullowney 
630aa372e3fSPaul Mullowney       /* Create the matrix description */
63157d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
63257d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
63357d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
63457d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
63557d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
636aa372e3fSPaul Mullowney 
637aa372e3fSPaul Mullowney       /* Create the solve analysis information */
63857d48284SJunchao Zhang       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
639aa372e3fSPaul Mullowney 
640aa372e3fSPaul Mullowney       /* set the operation */
641aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
642aa372e3fSPaul Mullowney 
643aa372e3fSPaul Mullowney       /* set the matrix */
644aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
645aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
646aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
647aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
648aa372e3fSPaul Mullowney 
649aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
650aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
651aa372e3fSPaul Mullowney 
652aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
653aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
654aa372e3fSPaul Mullowney 
655aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
656aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
6574863603aSSatish Balay       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
658aa372e3fSPaul Mullowney 
659aa372e3fSPaul Mullowney       /* perform the solve analysis */
660aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
661aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
662aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
66357d48284SJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
664aa372e3fSPaul Mullowney 
665aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
666aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
667087f3262SPaul Mullowney 
668c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
66957d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
67057d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
67157d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
67257d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
673087f3262SPaul Mullowney     } catch(char *ex) {
674087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
675087f3262SPaul Mullowney     }
676087f3262SPaul Mullowney   }
677087f3262SPaul Mullowney   PetscFunctionReturn(0);
678087f3262SPaul Mullowney }
679087f3262SPaul Mullowney 
680087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6819ae82921SPaul Mullowney {
6829ae82921SPaul Mullowney   PetscErrorCode               ierr;
683087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
684087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
685087f3262SPaul Mullowney   IS                           ip = a->row;
686087f3262SPaul Mullowney   const PetscInt               *rip;
687087f3262SPaul Mullowney   PetscBool                    perm_identity;
688087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
689087f3262SPaul Mullowney 
690087f3262SPaul Mullowney   PetscFunctionBegin;
691ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
692087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
693e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
694aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
695aa372e3fSPaul Mullowney 
696087f3262SPaul Mullowney   /* lower triangular indices */
697087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
698087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
699087f3262SPaul Mullowney   if (!perm_identity) {
7004e4bbfaaSStefano Zampini     IS             iip;
7014e4bbfaaSStefano Zampini     const PetscInt *irip;
7024e4bbfaaSStefano Zampini 
7034e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
7044e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
705aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
706aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
707aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
7084e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
7094e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
7104e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
7114863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
712087f3262SPaul Mullowney   }
713087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
714087f3262SPaul Mullowney   PetscFunctionReturn(0);
715087f3262SPaul Mullowney }
716087f3262SPaul Mullowney 
7176fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
7189ae82921SPaul Mullowney {
7199ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
7209ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
7219ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
722b175d8bbSPaul Mullowney   PetscErrorCode ierr;
7239ae82921SPaul Mullowney 
7249ae82921SPaul Mullowney   PetscFunctionBegin;
7259ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
726ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
727e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
7289ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
7299ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
730bda325fcSPaul Mullowney   if (row_identity && col_identity) {
731bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
732bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
7334e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7344e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
735bda325fcSPaul Mullowney   } else {
736bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
737bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
7384e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7394e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
740bda325fcSPaul Mullowney   }
7418dc1d2a3SPaul Mullowney 
742e057df02SPaul Mullowney   /* get the triangular factors */
743087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
7449ae82921SPaul Mullowney   PetscFunctionReturn(0);
7459ae82921SPaul Mullowney }
7469ae82921SPaul Mullowney 
747087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
748087f3262SPaul Mullowney {
749087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
750087f3262SPaul Mullowney   IS             ip = b->row;
751087f3262SPaul Mullowney   PetscBool      perm_identity;
752b175d8bbSPaul Mullowney   PetscErrorCode ierr;
753087f3262SPaul Mullowney 
754087f3262SPaul Mullowney   PetscFunctionBegin;
755087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
756ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
757087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
758087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
759087f3262SPaul Mullowney   if (perm_identity) {
760087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
761087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
7624e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7634e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
764087f3262SPaul Mullowney   } else {
765087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
766087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
7674e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
7684e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
769087f3262SPaul Mullowney   }
770087f3262SPaul Mullowney 
771087f3262SPaul Mullowney   /* get the triangular factors */
772087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
773087f3262SPaul Mullowney   PetscFunctionReturn(0);
774087f3262SPaul Mullowney }
7759ae82921SPaul Mullowney 
776b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
777bda325fcSPaul Mullowney {
778bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
779aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
780aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
781aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
782aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
783bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
784aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
785aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
786aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
787aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
788b175d8bbSPaul Mullowney 
789bda325fcSPaul Mullowney   PetscFunctionBegin;
790bda325fcSPaul Mullowney 
791aa372e3fSPaul Mullowney   /*********************************************/
792aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
793aa372e3fSPaul Mullowney   /*********************************************/
794aa372e3fSPaul Mullowney 
795aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
796aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
797aa372e3fSPaul Mullowney 
798aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
799aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
800aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
801aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
802aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
803aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
804aa372e3fSPaul Mullowney 
805aa372e3fSPaul Mullowney   /* Create the matrix description */
80657d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
80757d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
80857d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
80957d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
81057d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
811aa372e3fSPaul Mullowney 
812aa372e3fSPaul Mullowney   /* Create the solve analysis information */
81357d48284SJunchao Zhang   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
814aa372e3fSPaul Mullowney 
815aa372e3fSPaul Mullowney   /* set the operation */
816aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
817aa372e3fSPaul Mullowney 
818aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
819aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
820aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
821aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
822aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
823aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
824aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
825aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
826aa372e3fSPaul Mullowney 
827aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
828aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
829aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
830aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
831aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
832aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
833aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
834aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
835aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
83657d48284SJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
837aa372e3fSPaul Mullowney 
838aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
839aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
840aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
841aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
842aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
84357d48284SJunchao Zhang                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
846aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
847aa372e3fSPaul Mullowney 
848aa372e3fSPaul Mullowney   /*********************************************/
849aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
850aa372e3fSPaul Mullowney   /*********************************************/
851aa372e3fSPaul Mullowney 
852aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
853aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
854aa372e3fSPaul Mullowney 
855aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
856aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
857aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
858aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
859aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
860aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
861aa372e3fSPaul Mullowney 
862aa372e3fSPaul Mullowney   /* Create the matrix description */
86357d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
86457d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
86557d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
86657d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
86757d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
868aa372e3fSPaul Mullowney 
869aa372e3fSPaul Mullowney   /* Create the solve analysis information */
87057d48284SJunchao Zhang   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
871aa372e3fSPaul Mullowney 
872aa372e3fSPaul Mullowney   /* set the operation */
873aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
874aa372e3fSPaul Mullowney 
875aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
876aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
877aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
878aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
879aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
880aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
881aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
882aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
883aa372e3fSPaul Mullowney 
884aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
885aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
886aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
887aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
888aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
889aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
890aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
891aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
892aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
89357d48284SJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
894aa372e3fSPaul Mullowney 
895aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
896aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
897aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
898aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
899aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
90057d48284SJunchao Zhang                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
901aa372e3fSPaul Mullowney 
902aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
903aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
904bda325fcSPaul Mullowney   PetscFunctionReturn(0);
905bda325fcSPaul Mullowney }
906bda325fcSPaul Mullowney 
907b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
908bda325fcSPaul Mullowney {
909aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
910aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
911aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
912bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
913bda325fcSPaul Mullowney   cusparseStatus_t             stat;
914aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
915b06137fdSPaul Mullowney   cudaError_t                  err;
91685ba7357SStefano Zampini   PetscErrorCode               ierr;
917b175d8bbSPaul Mullowney 
918bda325fcSPaul Mullowney   PetscFunctionBegin;
91985ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
92085ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
92185ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
92285ba7357SStefano Zampini   /* create cusparse matrix */
923aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
92457d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
925aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
92657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
92757d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
928aa372e3fSPaul Mullowney 
929b06137fdSPaul Mullowney   /* set alpha and beta */
930c41cb2e2SAlejandro Lamas Daviña   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
9317656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
9327656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
9337656d835SStefano Zampini   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
9347656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
9357656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
93657d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
937b06137fdSPaul Mullowney 
938aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
939aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
940aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
941554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
942554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
943aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
944a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
945aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
946aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
947a3fdcf43SKarl Rupp 
94881902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
94981902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
95081902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
951aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
952a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
953a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
954aa372e3fSPaul Mullowney                             matrix->values->data().get(),
95581902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
956aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
957aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
958aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
959aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
96057d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
961aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
962aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
963aa372e3fSPaul Mullowney     CsrMatrix *temp = new CsrMatrix;
96451c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
96551c6d536SStefano Zampini 
96651c6d536SStefano Zampini     /* First convert HYB to CSR */
967aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
968aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
969aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
970aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
971aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
972aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
973aa372e3fSPaul Mullowney 
974aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
975aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
976aa372e3fSPaul Mullowney                             temp->values->data().get(),
977aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
97857d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
979aa372e3fSPaul Mullowney 
980aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
981aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
982aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
983aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
984aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
985aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
986aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
987aa372e3fSPaul Mullowney 
988aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
989aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
990aa372e3fSPaul Mullowney                             temp->values->data().get(),
991aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
992aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
993aa372e3fSPaul Mullowney                             tempT->values->data().get(),
994aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
995aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
99657d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
997aa372e3fSPaul Mullowney 
998aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
999aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
100057d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1001aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1002aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1003aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1004aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1005aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1006aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
100757d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1008aa372e3fSPaul Mullowney 
1009aa372e3fSPaul Mullowney     /* assign the pointer */
1010aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1011aa372e3fSPaul Mullowney     /* delete temporaries */
1012aa372e3fSPaul Mullowney     if (tempT) {
1013aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1014aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1015aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1016aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1017087f3262SPaul Mullowney     }
1018aa372e3fSPaul Mullowney     if (temp) {
1019aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1020aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1021aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1022aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1023aa372e3fSPaul Mullowney     }
1024aa372e3fSPaul Mullowney   }
102585ba7357SStefano Zampini   err  = WaitForGPU();CHKERRCUDA(err);
102685ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
102785ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1028213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1029213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1030aa372e3fSPaul Mullowney   /* assign the pointer */
1031aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1032bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1033bda325fcSPaul Mullowney }
1034bda325fcSPaul Mullowney 
10354e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
10366fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1037bda325fcSPaul Mullowney {
1038c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1039465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1040465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1041465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1042465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1043bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1044bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1045aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1046aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1047aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1048b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
104957d48284SJunchao Zhang   cudaError_t                           cerr;
1050bda325fcSPaul Mullowney 
1051bda325fcSPaul Mullowney   PetscFunctionBegin;
1052aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1053aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1054bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1055aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1056aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1057bda325fcSPaul Mullowney   }
1058bda325fcSPaul Mullowney 
1059bda325fcSPaul Mullowney   /* Get the GPU pointers */
1060c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1061c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1062c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1063c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1064bda325fcSPaul Mullowney 
10657a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1066aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1067c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1068c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1069c41cb2e2SAlejandro Lamas Daviña                xGPU);
1070aa372e3fSPaul Mullowney 
1071aa372e3fSPaul Mullowney   /* First, solve U */
1072aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
10737656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1074aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1075aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1076aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1077aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
107857d48284SJunchao Zhang                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1079aa372e3fSPaul Mullowney 
1080aa372e3fSPaul Mullowney   /* Then, solve L */
1081aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
10827656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1083aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1084aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1085aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1086aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
108757d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1088aa372e3fSPaul Mullowney 
1089aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1090c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1091c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1092aa372e3fSPaul Mullowney                tempGPU->begin());
1093aa372e3fSPaul Mullowney 
1094aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1095c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1096bda325fcSPaul Mullowney 
1097bda325fcSPaul Mullowney   /* restore */
1098c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1099c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
110057d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1101661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1102958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1103bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1104bda325fcSPaul Mullowney }
1105bda325fcSPaul Mullowney 
11066fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1107bda325fcSPaul Mullowney {
1108465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1109465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1110bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1111bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1112aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1113aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1114aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1115b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
111657d48284SJunchao Zhang   cudaError_t                       cerr;
1117bda325fcSPaul Mullowney 
1118bda325fcSPaul Mullowney   PetscFunctionBegin;
1119aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1120aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1121bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1122aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1123aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1124bda325fcSPaul Mullowney   }
1125bda325fcSPaul Mullowney 
1126bda325fcSPaul Mullowney   /* Get the GPU pointers */
1127c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1128c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1129bda325fcSPaul Mullowney 
11307a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1131aa372e3fSPaul Mullowney   /* First, solve U */
1132aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
11337656d835SStefano Zampini                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1134aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1135aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1136aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1137aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
113857d48284SJunchao Zhang                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1139aa372e3fSPaul Mullowney 
1140aa372e3fSPaul Mullowney   /* Then, solve L */
1141aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
11427656d835SStefano Zampini                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1143aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1144aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1145aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1146aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
114757d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1148bda325fcSPaul Mullowney 
1149bda325fcSPaul Mullowney   /* restore */
1150c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1151c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
115257d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1153661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1154958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1155bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1156bda325fcSPaul Mullowney }
1157bda325fcSPaul Mullowney 
11586fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
11599ae82921SPaul Mullowney {
1160465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1161465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1162465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1163465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
11649ae82921SPaul Mullowney   cusparseStatus_t                      stat;
11659ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1166aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1167aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1168aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1169b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
117057d48284SJunchao Zhang   cudaError_t                           cerr;
11719ae82921SPaul Mullowney 
11729ae82921SPaul Mullowney   PetscFunctionBegin;
1173ebc8f436SDominic Meiser 
1174e057df02SPaul Mullowney   /* Get the GPU pointers */
1175c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1176c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1177c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1178c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
11799ae82921SPaul Mullowney 
11807a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1181aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1182c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1183c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
11844e4bbfaaSStefano Zampini                tempGPU->begin());
1185aa372e3fSPaul Mullowney 
1186aa372e3fSPaul Mullowney   /* Next, solve L */
1187aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
11887656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1189aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1190aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1191aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1192aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
119357d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1194aa372e3fSPaul Mullowney 
1195aa372e3fSPaul Mullowney   /* Then, solve U */
1196aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
11977656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1198aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1199aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1200aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1201aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
120257d48284SJunchao Zhang                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1203aa372e3fSPaul Mullowney 
12044e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
12054e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
12064e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
12074e4bbfaaSStefano Zampini                xGPU);
12089ae82921SPaul Mullowney 
1209c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1210c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
121157d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1212661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1213958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12149ae82921SPaul Mullowney   PetscFunctionReturn(0);
12159ae82921SPaul Mullowney }
12169ae82921SPaul Mullowney 
12176fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
12189ae82921SPaul Mullowney {
1219465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1220465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
12219ae82921SPaul Mullowney   cusparseStatus_t                  stat;
12229ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1223aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1224aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1225aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1226b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
122757d48284SJunchao Zhang   cudaError_t                       cerr;
12289ae82921SPaul Mullowney 
12299ae82921SPaul Mullowney   PetscFunctionBegin;
1230e057df02SPaul Mullowney   /* Get the GPU pointers */
1231c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1232c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
12339ae82921SPaul Mullowney 
12347a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1235aa372e3fSPaul Mullowney   /* First, solve L */
1236aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
12377656d835SStefano Zampini                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1238aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1239aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1240aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1241aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
124257d48284SJunchao Zhang                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1243aa372e3fSPaul Mullowney 
1244aa372e3fSPaul Mullowney   /* Next, solve U */
1245aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
12467656d835SStefano Zampini                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1247aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1248aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1249aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1250aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
125157d48284SJunchao Zhang                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
12529ae82921SPaul Mullowney 
1253c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1254c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
125557d48284SJunchao Zhang   cerr = WaitForGPU();CHKERRCUDA(cerr);
1256661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1257958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12589ae82921SPaul Mullowney   PetscFunctionReturn(0);
12599ae82921SPaul Mullowney }
12609ae82921SPaul Mullowney 
12616fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
12629ae82921SPaul Mullowney {
1263aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
12647c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
12659ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1266213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
12679ae82921SPaul Mullowney   PetscErrorCode               ierr;
1268aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1269b06137fdSPaul Mullowney   cudaError_t                  err;
12709ae82921SPaul Mullowney 
12719ae82921SPaul Mullowney   PetscFunctionBegin;
127295639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1273c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
127481902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
127581902715SJunchao Zhang       /* Copy values only */
127681902715SJunchao Zhang       CsrMatrix *mat,*matT;
127781902715SJunchao Zhang       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
127885ba7357SStefano Zampini 
127985ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
128085ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
128181902715SJunchao Zhang       mat->values->assign(a->a, a->a+a->nz);
128285ba7357SStefano Zampini       err  = WaitForGPU();CHKERRCUDA(err);
128385ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
12844863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
128585ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
128681902715SJunchao Zhang 
128781902715SJunchao Zhang       /* Update matT when it was built before */
128881902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
128981902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
129081902715SJunchao Zhang         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
129185ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
129285ba7357SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
129381902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
129481902715SJunchao Zhang                                  A->cmap->n, mat->num_entries,
129581902715SJunchao Zhang                                  mat->values->data().get(),
129681902715SJunchao Zhang                                  cusparsestruct->rowoffsets_gpu->data().get(),
129781902715SJunchao Zhang                                  mat->column_indices->data().get(),
129881902715SJunchao Zhang                                  matT->values->data().get(),
129981902715SJunchao Zhang                                  matT->column_indices->data().get(),
130081902715SJunchao Zhang                                  matT->row_offsets->data().get(),
13019f74ec24SSatish Balay                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
130285ba7357SStefano Zampini         err  = WaitForGPU();CHKERRCUDA(err);
130385ba7357SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
130485ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
130581902715SJunchao Zhang       }
130634d6c7a5SJose E. Roman     } else {
130785ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
130885ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
13097c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
13107c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
13117c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
131281902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
13139ae82921SPaul Mullowney       try {
13149ae82921SPaul Mullowney         if (a->compressedrow.use) {
13159ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
13169ae82921SPaul Mullowney           ii   = a->compressedrow.i;
13179ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
13189ae82921SPaul Mullowney         } else {
1319213423ffSJunchao Zhang           m    = A->rmap->n;
1320213423ffSJunchao Zhang           ii   = a->i;
1321e6e9a74fSStefano Zampini           ridx = NULL;
13229ae82921SPaul Mullowney         }
1323213423ffSJunchao Zhang         cusparsestruct->nrows = m;
13249ae82921SPaul Mullowney 
132585ba7357SStefano Zampini         /* create cusparse matrix */
1326aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
132757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
132857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
132957d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
13309ae82921SPaul Mullowney 
1331c41cb2e2SAlejandro Lamas Daviña         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
13327656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
13337656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
13347656d835SStefano Zampini         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
13357656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
13367656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
133757d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1338b06137fdSPaul Mullowney 
1339aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1340aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1341aa372e3fSPaul Mullowney           /* set the matrix */
1342aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1343a65300a6SPaul Mullowney           matrix->num_rows = m;
1344aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1345aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1346a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1347a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
13489ae82921SPaul Mullowney 
1349aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1350aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1351aa372e3fSPaul Mullowney 
1352aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1353aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1354aa372e3fSPaul Mullowney 
1355aa372e3fSPaul Mullowney           /* assign the pointer */
1356aa372e3fSPaul Mullowney           matstruct->mat = matrix;
1357aa372e3fSPaul Mullowney 
1358aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1359aa372e3fSPaul Mullowney           CsrMatrix *matrix= new CsrMatrix;
1360a65300a6SPaul Mullowney           matrix->num_rows = m;
1361aa372e3fSPaul Mullowney           matrix->num_cols = A->cmap->n;
1362aa372e3fSPaul Mullowney           matrix->num_entries = a->nz;
1363a65300a6SPaul Mullowney           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1364a65300a6SPaul Mullowney           matrix->row_offsets->assign(ii, ii + m+1);
1365aa372e3fSPaul Mullowney 
1366aa372e3fSPaul Mullowney           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1367aa372e3fSPaul Mullowney           matrix->column_indices->assign(a->j, a->j+a->nz);
1368aa372e3fSPaul Mullowney 
1369aa372e3fSPaul Mullowney           matrix->values = new THRUSTARRAY(a->nz);
1370aa372e3fSPaul Mullowney           matrix->values->assign(a->a, a->a+a->nz);
1371aa372e3fSPaul Mullowney 
1372aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
137357d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1374aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1375aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1376a65300a6SPaul Mullowney           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1377aa372e3fSPaul Mullowney               matstruct->descr, matrix->values->data().get(),
1378aa372e3fSPaul Mullowney               matrix->row_offsets->data().get(),
1379aa372e3fSPaul Mullowney               matrix->column_indices->data().get(),
138057d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1381aa372e3fSPaul Mullowney           /* assign the pointer */
1382aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1383aa372e3fSPaul Mullowney 
1384aa372e3fSPaul Mullowney           if (matrix) {
1385aa372e3fSPaul Mullowney             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1386aa372e3fSPaul Mullowney             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1387aa372e3fSPaul Mullowney             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1388aa372e3fSPaul Mullowney             delete (CsrMatrix*)matrix;
1389087f3262SPaul Mullowney           }
1390087f3262SPaul Mullowney         }
1391ca45077fSPaul Mullowney 
1392aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1393213423ffSJunchao Zhang         if (a->compressedrow.use) {
1394213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1395aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1396aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1397213423ffSJunchao Zhang           tmp = m;
1398213423ffSJunchao Zhang         } else {
1399213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1400213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1401213423ffSJunchao Zhang           tmp = 0;
1402213423ffSJunchao Zhang         }
1403213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1404aa372e3fSPaul Mullowney 
1405aa372e3fSPaul Mullowney         /* assign the pointer */
1406aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
14079ae82921SPaul Mullowney       } catch(char *ex) {
14089ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
14099ae82921SPaul Mullowney       }
141085ba7357SStefano Zampini       err  = WaitForGPU();CHKERRCUDA(err);
141185ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
141285ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
141334d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
141434d6c7a5SJose E. Roman     }
1415c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
14169ae82921SPaul Mullowney   }
14179ae82921SPaul Mullowney   PetscFunctionReturn(0);
14189ae82921SPaul Mullowney }
14199ae82921SPaul Mullowney 
1420c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1421aa372e3fSPaul Mullowney {
1422aa372e3fSPaul Mullowney   template <typename Tuple>
1423aa372e3fSPaul Mullowney   __host__ __device__
1424aa372e3fSPaul Mullowney   void operator()(Tuple t)
1425aa372e3fSPaul Mullowney   {
1426aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1427aa372e3fSPaul Mullowney   }
1428aa372e3fSPaul Mullowney };
1429aa372e3fSPaul Mullowney 
1430e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1431e6e9a74fSStefano Zampini {
1432e6e9a74fSStefano Zampini   template <typename Tuple>
1433e6e9a74fSStefano Zampini   __host__ __device__
1434e6e9a74fSStefano Zampini   void operator()(Tuple t)
1435e6e9a74fSStefano Zampini   {
1436e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1437e6e9a74fSStefano Zampini   }
1438e6e9a74fSStefano Zampini };
1439e6e9a74fSStefano Zampini 
1440ccdfe979SStefano Zampini typedef struct {
1441ccdfe979SStefano Zampini   PetscBool   cisdense;
1442ccdfe979SStefano Zampini   PetscScalar *Bt;
1443ccdfe979SStefano Zampini   Mat         X;
1444ccdfe979SStefano Zampini } MatMatCusparse;
1445ccdfe979SStefano Zampini 
1446ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1447ccdfe979SStefano Zampini {
1448ccdfe979SStefano Zampini   PetscErrorCode ierr;
1449ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1450ccdfe979SStefano Zampini   cudaError_t    cerr;
1451ccdfe979SStefano Zampini 
1452ccdfe979SStefano Zampini   PetscFunctionBegin;
1453ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1454ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1455ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1456ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1457ccdfe979SStefano Zampini }
1458ccdfe979SStefano Zampini 
1459ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1460ccdfe979SStefano Zampini 
1461ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1462ccdfe979SStefano Zampini {
1463ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1464ccdfe979SStefano Zampini   Mat                          A,B;
1465ccdfe979SStefano Zampini   PetscInt                     m,n,k,blda,clda;
1466ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1467ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1468ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1469ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1470ccdfe979SStefano Zampini   cusparseMatDescr_t           matA;
1471ccdfe979SStefano Zampini   const PetscScalar            *barray;
1472ccdfe979SStefano Zampini   PetscScalar                  *carray;
1473ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1474ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1475ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1476ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1477c8378d12SStefano Zampini   cudaError_t                  cuer;
1478ccdfe979SStefano Zampini 
1479ccdfe979SStefano Zampini   PetscFunctionBegin;
1480ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1481ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1482ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1483ccdfe979SStefano Zampini   A    = product->A;
1484ccdfe979SStefano Zampini   B    = product->B;
1485ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1486ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1487ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1488ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1489ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1490ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1491ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1492ccdfe979SStefano Zampini   switch (product->type) {
1493ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1494ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1495ccdfe979SStefano Zampini     mat = cusp->mat;
1496ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1497ccdfe979SStefano Zampini     m   = A->rmap->n;
1498ccdfe979SStefano Zampini     k   = A->cmap->n;
1499ccdfe979SStefano Zampini     n   = B->cmap->n;
1500ccdfe979SStefano Zampini     break;
1501ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1502e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1503e6e9a74fSStefano Zampini       mat = cusp->mat;
1504e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1505e6e9a74fSStefano Zampini     } else {
1506ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1507ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1508ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1509e6e9a74fSStefano Zampini     }
1510ccdfe979SStefano Zampini     m = A->cmap->n;
1511ccdfe979SStefano Zampini     k = A->rmap->n;
1512ccdfe979SStefano Zampini     n = B->cmap->n;
1513ccdfe979SStefano Zampini     break;
1514ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1515ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1516ccdfe979SStefano Zampini     mat = cusp->mat;
1517ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1518ccdfe979SStefano Zampini     m   = A->rmap->n;
1519ccdfe979SStefano Zampini     k   = B->cmap->n;
1520ccdfe979SStefano Zampini     n   = B->rmap->n;
1521ccdfe979SStefano Zampini     break;
1522ccdfe979SStefano Zampini   default:
1523ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1524ccdfe979SStefano Zampini   }
1525ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1526ccdfe979SStefano Zampini   matA   = mat->descr;
1527ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1528ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1529ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1530ccdfe979SStefano Zampini   if (!biscuda) {
1531ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1532ccdfe979SStefano Zampini   }
1533ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1534ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1535c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1536c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1537c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1538c8378d12SStefano Zampini   } else {
1539c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1540c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1541c8378d12SStefano Zampini   }
1542c8378d12SStefano Zampini 
1543c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1544c8378d12SStefano Zampini 
1545ccdfe979SStefano Zampini   /* cusparse spmm does not support transpose on B */
1546ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1547ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
1548ccdfe979SStefano Zampini     cublasStatus_t cerr;
1549ccdfe979SStefano Zampini 
1550ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1551ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1552ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
1553ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
1554ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
1555ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1556ccdfe979SStefano Zampini     blda = B->cmap->n;
1557ccdfe979SStefano Zampini   }
1558ccdfe979SStefano Zampini 
1559ccdfe979SStefano Zampini   /* perform the MatMat operation */
1560ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1561ccdfe979SStefano Zampini                            csrmat->num_entries,mat->alpha,matA,
1562ccdfe979SStefano Zampini                            csrmat->values->data().get(),
1563ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
1564ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
1565ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1566ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
1567c8378d12SStefano Zampini   cuer = WaitForGPU();CHKERRCUDA(cuer);
1568c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1569c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1570ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1571ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
1572ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1573ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1574ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
1575ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1576ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1577ccdfe979SStefano Zampini   } else {
1578ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1579ccdfe979SStefano Zampini   }
1580ccdfe979SStefano Zampini   if (mmdata->cisdense) {
1581ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1582ccdfe979SStefano Zampini   }
1583ccdfe979SStefano Zampini   if (!biscuda) {
1584ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1585ccdfe979SStefano Zampini   }
1586ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1587ccdfe979SStefano Zampini }
1588ccdfe979SStefano Zampini 
1589ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1590ccdfe979SStefano Zampini {
1591ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
1592ccdfe979SStefano Zampini   Mat                A,B;
1593ccdfe979SStefano Zampini   PetscInt           m,n;
1594ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
1595ccdfe979SStefano Zampini   PetscErrorCode     ierr;
1596ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
1597ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
1598ccdfe979SStefano Zampini 
1599ccdfe979SStefano Zampini   PetscFunctionBegin;
1600ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1601ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1602ccdfe979SStefano Zampini   A    = product->A;
1603ccdfe979SStefano Zampini   B    = product->B;
1604ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1605ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1606ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1607ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1608ccdfe979SStefano Zampini   switch (product->type) {
1609ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1610ccdfe979SStefano Zampini     m = A->rmap->n;
1611ccdfe979SStefano Zampini     n = B->cmap->n;
1612ccdfe979SStefano Zampini     break;
1613ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1614ccdfe979SStefano Zampini     m = A->cmap->n;
1615ccdfe979SStefano Zampini     n = B->cmap->n;
1616ccdfe979SStefano Zampini     break;
1617ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1618ccdfe979SStefano Zampini     m = A->rmap->n;
1619ccdfe979SStefano Zampini     n = B->rmap->n;
1620ccdfe979SStefano Zampini     break;
1621ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1622ccdfe979SStefano Zampini     m = B->cmap->n;
1623ccdfe979SStefano Zampini     n = B->cmap->n;
1624ccdfe979SStefano Zampini     break;
1625ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1626ccdfe979SStefano Zampini     m = B->rmap->n;
1627ccdfe979SStefano Zampini     n = B->rmap->n;
1628ccdfe979SStefano Zampini     break;
1629ccdfe979SStefano Zampini   default:
1630ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1631ccdfe979SStefano Zampini   }
1632ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1633ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1634ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1635ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1636ccdfe979SStefano Zampini 
1637ccdfe979SStefano Zampini   /* product data */
1638ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1639ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
1640ccdfe979SStefano Zampini   /* cusparse spmm does not support transpose on B */
1641ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1642ccdfe979SStefano Zampini     cudaError_t cerr;
1643ccdfe979SStefano Zampini 
1644ccdfe979SStefano Zampini     cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1645ccdfe979SStefano Zampini   }
1646ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
1647ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1648ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1649ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1650ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1651ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1652ccdfe979SStefano Zampini     } else {
1653ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1654ccdfe979SStefano Zampini     }
1655ccdfe979SStefano Zampini   }
1656ccdfe979SStefano Zampini   C->product->data    = mmdata;
1657ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
1658ccdfe979SStefano Zampini 
1659ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1660ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1661ccdfe979SStefano Zampini }
1662ccdfe979SStefano Zampini 
1663ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1664ccdfe979SStefano Zampini 
1665ccdfe979SStefano Zampini /* handles dense B */
1666ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
1667ccdfe979SStefano Zampini {
1668ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
1669ccdfe979SStefano Zampini   PetscErrorCode ierr;
1670ccdfe979SStefano Zampini 
1671ccdfe979SStefano Zampini   PetscFunctionBegin;
1672ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1673ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1674ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
1675ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
1676ccdfe979SStefano Zampini     PetscFunctionReturn(0);
1677ccdfe979SStefano Zampini   }
1678ccdfe979SStefano Zampini   switch (product->type) {
1679ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1680ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1681ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1682ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1683ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1684ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
1685ccdfe979SStefano Zampini   default:
1686ccdfe979SStefano Zampini     break;
1687ccdfe979SStefano Zampini   }
1688ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1689ccdfe979SStefano Zampini }
1690ccdfe979SStefano Zampini 
16916fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
16929ae82921SPaul Mullowney {
1693b175d8bbSPaul Mullowney   PetscErrorCode ierr;
16949ae82921SPaul Mullowney 
16959ae82921SPaul Mullowney   PetscFunctionBegin;
1696e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1697e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1698e6e9a74fSStefano Zampini }
1699e6e9a74fSStefano Zampini 
1700e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
1701e6e9a74fSStefano Zampini {
1702e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1703e6e9a74fSStefano Zampini 
1704e6e9a74fSStefano Zampini   PetscFunctionBegin;
1705e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1706e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1707e6e9a74fSStefano Zampini }
1708e6e9a74fSStefano Zampini 
1709e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1710e6e9a74fSStefano Zampini {
1711e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1712e6e9a74fSStefano Zampini 
1713e6e9a74fSStefano Zampini   PetscFunctionBegin;
1714e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
1715e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
1716e6e9a74fSStefano Zampini }
1717e6e9a74fSStefano Zampini 
1718e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1719e6e9a74fSStefano Zampini {
1720e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1721e6e9a74fSStefano Zampini 
1722e6e9a74fSStefano Zampini   PetscFunctionBegin;
1723e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
17249ae82921SPaul Mullowney   PetscFunctionReturn(0);
17259ae82921SPaul Mullowney }
17269ae82921SPaul Mullowney 
17276fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1728ca45077fSPaul Mullowney {
1729b175d8bbSPaul Mullowney   PetscErrorCode ierr;
1730ca45077fSPaul Mullowney 
1731ca45077fSPaul Mullowney   PetscFunctionBegin;
1732e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1733ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1734ca45077fSPaul Mullowney }
1735ca45077fSPaul Mullowney 
1736e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
17379ae82921SPaul Mullowney {
17389ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1739aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
17409ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1741e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
1742b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
174357d48284SJunchao Zhang   cudaError_t                  cerr;
1744aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1745e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1746e6e9a74fSStefano Zampini   PetscBool                    compressed;
17476e111a19SKarl Rupp 
17489ae82921SPaul Mullowney   PetscFunctionBegin;
1749e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
1750e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
1751e6e9a74fSStefano Zampini     if (!yy) {
1752e6e9a74fSStefano Zampini       ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
1753e6e9a74fSStefano Zampini     }
1754e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
1755e6e9a74fSStefano Zampini   }
175634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
175734d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1758e6e9a74fSStefano Zampini   if (!trans) {
17599ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1760e6e9a74fSStefano Zampini   } else {
1761e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
1762e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
1763e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1764e6e9a74fSStefano Zampini     } else {
1765e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1766e6e9a74fSStefano Zampini       if (!matstruct) {
1767e6e9a74fSStefano Zampini         ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1768e6e9a74fSStefano Zampini         matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1769e6e9a74fSStefano Zampini       }
1770e6e9a74fSStefano Zampini     }
1771e6e9a74fSStefano Zampini   }
1772e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
1773e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
1774213423ffSJunchao Zhang 
1775e6e9a74fSStefano Zampini   try {
1776e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
1777213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
1778213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
177985ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1780e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1781e6e9a74fSStefano Zampini       xptr = xarray;
1782e6e9a74fSStefano Zampini       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
1783213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
1784e6e9a74fSStefano Zampini     } else {
1785e6e9a74fSStefano Zampini       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
1786e6e9a74fSStefano Zampini       dptr = zarray;
1787e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
1788e6e9a74fSStefano Zampini 
1789e6e9a74fSStefano Zampini       if (compressed) {
1790e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
1791e6e9a74fSStefano Zampini 
1792e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
1793e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1794e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
1795e6e9a74fSStefano Zampini       }
1796e6e9a74fSStefano Zampini     }
17979ae82921SPaul Mullowney 
1798213423ffSJunchao Zhang     /* csr_spmv does y = alpha*Ax + beta*y */
1799aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
18007656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1801e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
1802a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1803b06137fdSPaul Mullowney                                mat->num_entries, matstruct->alpha, matstruct->descr,
1804aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1805e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
180657d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
1807aa372e3fSPaul Mullowney     } else {
1808213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
1809301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1810e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
1811b06137fdSPaul Mullowney                                  matstruct->alpha, matstruct->descr, hybMat,
1812e6e9a74fSStefano Zampini                                  xptr, beta,
181357d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
1814a65300a6SPaul Mullowney       }
1815aa372e3fSPaul Mullowney     }
1816e6e9a74fSStefano Zampini     cerr = WaitForGPU();CHKERRCUDA(cerr);
1817958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1818aa372e3fSPaul Mullowney 
1819e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1820213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
1821213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
1822213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
1823e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
1824213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
18257656d835SStefano Zampini         }
1826213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
1827c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
18287656d835SStefano Zampini       }
18297656d835SStefano Zampini 
1830213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
1831213423ffSJunchao Zhang       if (compressed) {
1832213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
1833e6e9a74fSStefano Zampini 
1834e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1835c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1836e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1837c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
1838e6e9a74fSStefano Zampini         cerr = WaitForGPU();CHKERRCUDA(cerr);
1839958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1840e6e9a74fSStefano Zampini       }
1841e6e9a74fSStefano Zampini     } else {
1842e6e9a74fSStefano Zampini       if (yy && yy != zz) {
1843e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
1844e6e9a74fSStefano Zampini       }
1845e6e9a74fSStefano Zampini     }
1846e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
1847213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
1848213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
18499ae82921SPaul Mullowney   } catch(char *ex) {
18509ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18519ae82921SPaul Mullowney   }
1852e6e9a74fSStefano Zampini   if (yy) {
1853958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1854e6e9a74fSStefano Zampini   } else {
1855e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
1856e6e9a74fSStefano Zampini   }
18579ae82921SPaul Mullowney   PetscFunctionReturn(0);
18589ae82921SPaul Mullowney }
18599ae82921SPaul Mullowney 
18606fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1861ca45077fSPaul Mullowney {
1862b175d8bbSPaul Mullowney   PetscErrorCode ierr;
18636e111a19SKarl Rupp 
1864ca45077fSPaul Mullowney   PetscFunctionBegin;
1865e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1866ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1867ca45077fSPaul Mullowney }
1868ca45077fSPaul Mullowney 
18696fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
18709ae82921SPaul Mullowney {
18719ae82921SPaul Mullowney   PetscErrorCode ierr;
18726e111a19SKarl Rupp 
18739ae82921SPaul Mullowney   PetscFunctionBegin;
18749ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
187595639643SRichard Tran Mills   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
1876bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1877e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1878bc3f50f2SPaul Mullowney   }
18799ae82921SPaul Mullowney   PetscFunctionReturn(0);
18809ae82921SPaul Mullowney }
18819ae82921SPaul Mullowney 
18829ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
1883e057df02SPaul Mullowney /*@
18849ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1885e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1886e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1887e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1888e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1889e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
18909ae82921SPaul Mullowney 
1891d083f849SBarry Smith    Collective
18929ae82921SPaul Mullowney 
18939ae82921SPaul Mullowney    Input Parameters:
18949ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
18959ae82921SPaul Mullowney .  m - number of rows
18969ae82921SPaul Mullowney .  n - number of columns
18979ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
18989ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
18990298fd71SBarry Smith          (possibly different for each row) or NULL
19009ae82921SPaul Mullowney 
19019ae82921SPaul Mullowney    Output Parameter:
19029ae82921SPaul Mullowney .  A - the matrix
19039ae82921SPaul Mullowney 
19049ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
19059ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
19069ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
19079ae82921SPaul Mullowney 
19089ae82921SPaul Mullowney    Notes:
19099ae82921SPaul Mullowney    If nnz is given then nz is ignored
19109ae82921SPaul Mullowney 
19119ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
19129ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
19139ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
19149ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
19159ae82921SPaul Mullowney 
19169ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
19170298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
19189ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
19199ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
19209ae82921SPaul Mullowney 
19219ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
19229ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
19239ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
19249ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
19259ae82921SPaul Mullowney 
19269ae82921SPaul Mullowney    Level: intermediate
19279ae82921SPaul Mullowney 
1928e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
19299ae82921SPaul Mullowney @*/
19309ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
19319ae82921SPaul Mullowney {
19329ae82921SPaul Mullowney   PetscErrorCode ierr;
19339ae82921SPaul Mullowney 
19349ae82921SPaul Mullowney   PetscFunctionBegin;
19359ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
19369ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
19379ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
19389ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
19399ae82921SPaul Mullowney   PetscFunctionReturn(0);
19409ae82921SPaul Mullowney }
19419ae82921SPaul Mullowney 
19426fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
19439ae82921SPaul Mullowney {
19449ae82921SPaul Mullowney   PetscErrorCode ierr;
1945ab25e6cbSDominic Meiser 
19469ae82921SPaul Mullowney   PetscFunctionBegin;
19479ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
1948470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
19499ae82921SPaul Mullowney   } else {
1950470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1951aa372e3fSPaul Mullowney   }
1952ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
1953ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
1954ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
1955ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
19569ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
19579ae82921SPaul Mullowney   PetscFunctionReturn(0);
19589ae82921SPaul Mullowney }
19599ae82921SPaul Mullowney 
1960ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
196195639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
19629ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
19639ff858a8SKarl Rupp {
19649ff858a8SKarl Rupp   PetscErrorCode ierr;
19659ff858a8SKarl Rupp 
19669ff858a8SKarl Rupp   PetscFunctionBegin;
19679ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1968ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
19699ff858a8SKarl Rupp   PetscFunctionReturn(0);
19709ff858a8SKarl Rupp }
19719ff858a8SKarl Rupp 
197295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
197395639643SRichard Tran Mills {
1974e6e9a74fSStefano Zampini   PetscErrorCode ierr;
1975e6e9a74fSStefano Zampini 
197695639643SRichard Tran Mills   PetscFunctionBegin;
1977e6e9a74fSStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
1978c34f1ff0SRichard Tran Mills   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
197995639643SRichard Tran Mills      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1980e6e9a74fSStefano Zampini      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
198195639643SRichard Tran Mills      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
198295639643SRichard Tran Mills            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1983e6e9a74fSStefano Zampini   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
1984e6e9a74fSStefano Zampini   /* TODO: add support for this? */
198595639643SRichard Tran Mills   if (flg) {
198695639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJ;
198795639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJ;
1988c34f1ff0SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
1989c34f1ff0SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
1990e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = NULL;
1991e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = NULL;
1992e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
1993e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
199495639643SRichard Tran Mills   } else {
199595639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
199695639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
199795639643SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
199895639643SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
1999e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2000e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2001e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2002e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
200395639643SRichard Tran Mills   }
200495639643SRichard Tran Mills   A->boundtocpu = flg;
200595639643SRichard Tran Mills   PetscFunctionReturn(0);
200695639643SRichard Tran Mills }
200795639643SRichard Tran Mills 
200849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
20099ae82921SPaul Mullowney {
20109ae82921SPaul Mullowney   PetscErrorCode   ierr;
2011aa372e3fSPaul Mullowney   cusparseStatus_t stat;
201249735bf3SStefano Zampini   Mat              B;
20139ae82921SPaul Mullowney 
20149ae82921SPaul Mullowney   PetscFunctionBegin;
201549735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
201649735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
201749735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
201849735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
201949735bf3SStefano Zampini   }
202049735bf3SStefano Zampini   B = *newmat;
202149735bf3SStefano Zampini 
202234136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
202334136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
202434136279SStefano Zampini 
202549735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
20269ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
2027e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
2028e6e9a74fSStefano Zampini 
2029e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2030e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
2031e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2032e6e9a74fSStefano Zampini       B->spptr = spptr;
20339ae82921SPaul Mullowney     } else {
2034e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
2035e6e9a74fSStefano Zampini 
2036e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2037e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2038e6e9a74fSStefano Zampini       B->spptr = spptr;
20399ae82921SPaul Mullowney     }
2040e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
204149735bf3SStefano Zampini   }
2042693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
20439ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
20449ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
204595639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2046693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
20472205254eSKarl Rupp 
2048e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
20499ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2050bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
20519ae82921SPaul Mullowney   PetscFunctionReturn(0);
20529ae82921SPaul Mullowney }
20539ae82921SPaul Mullowney 
205402fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
205502fe1965SBarry Smith {
205602fe1965SBarry Smith   PetscErrorCode ierr;
205702fe1965SBarry Smith 
205802fe1965SBarry Smith   PetscFunctionBegin;
205902fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
20600ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
206102fe1965SBarry Smith   PetscFunctionReturn(0);
206202fe1965SBarry Smith }
206302fe1965SBarry Smith 
20643ca39a21SBarry Smith /*MC
2065e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2066e057df02SPaul Mullowney 
2067e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
20682692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
20692692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2070e057df02SPaul Mullowney 
2071e057df02SPaul Mullowney    Options Database Keys:
2072e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2073aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2074a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2075e057df02SPaul Mullowney 
2076e057df02SPaul Mullowney   Level: beginner
2077e057df02SPaul Mullowney 
20788468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2079e057df02SPaul Mullowney M*/
20807f756511SDominic Meiser 
208142c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
208242c9c57cSBarry Smith 
20830f39cd5aSBarry Smith 
20843ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
208542c9c57cSBarry Smith {
208642c9c57cSBarry Smith   PetscErrorCode ierr;
208742c9c57cSBarry Smith 
208842c9c57cSBarry Smith   PetscFunctionBegin;
20893ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20903ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20913ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
20923ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
209342c9c57cSBarry Smith   PetscFunctionReturn(0);
209442c9c57cSBarry Smith }
209529b38603SBarry Smith 
2096470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
20977f756511SDominic Meiser {
2098e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
20997f756511SDominic Meiser   cusparseStatus_t stat;
21007f756511SDominic Meiser   cusparseHandle_t handle;
21017f756511SDominic Meiser 
21027f756511SDominic Meiser   PetscFunctionBegin;
21037f756511SDominic Meiser   if (*cusparsestruct) {
2104e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2105e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
21067f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
210781902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
21087f756511SDominic Meiser     if (handle = (*cusparsestruct)->handle) {
210957d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
21107f756511SDominic Meiser     }
2111e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
21127f756511SDominic Meiser   }
21137f756511SDominic Meiser   PetscFunctionReturn(0);
21147f756511SDominic Meiser }
21157f756511SDominic Meiser 
21167f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
21177f756511SDominic Meiser {
21187f756511SDominic Meiser   PetscFunctionBegin;
21197f756511SDominic Meiser   if (*mat) {
21207f756511SDominic Meiser     delete (*mat)->values;
21217f756511SDominic Meiser     delete (*mat)->column_indices;
21227f756511SDominic Meiser     delete (*mat)->row_offsets;
21237f756511SDominic Meiser     delete *mat;
21247f756511SDominic Meiser     *mat = 0;
21257f756511SDominic Meiser   }
21267f756511SDominic Meiser   PetscFunctionReturn(0);
21277f756511SDominic Meiser }
21287f756511SDominic Meiser 
2129470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
21307f756511SDominic Meiser {
21317f756511SDominic Meiser   cusparseStatus_t stat;
21327f756511SDominic Meiser   PetscErrorCode   ierr;
21337f756511SDominic Meiser 
21347f756511SDominic Meiser   PetscFunctionBegin;
21357f756511SDominic Meiser   if (*trifactor) {
213657d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
213757d48284SJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
21387f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
21397f756511SDominic Meiser     delete *trifactor;
21407f756511SDominic Meiser     *trifactor = 0;
21417f756511SDominic Meiser   }
21427f756511SDominic Meiser   PetscFunctionReturn(0);
21437f756511SDominic Meiser }
21447f756511SDominic Meiser 
2145470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
21467f756511SDominic Meiser {
21477f756511SDominic Meiser   CsrMatrix        *mat;
21487f756511SDominic Meiser   cusparseStatus_t stat;
21497f756511SDominic Meiser   cudaError_t      err;
21507f756511SDominic Meiser 
21517f756511SDominic Meiser   PetscFunctionBegin;
21527f756511SDominic Meiser   if (*matstruct) {
21537f756511SDominic Meiser     if ((*matstruct)->mat) {
21547f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
21557f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
215657d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
21577f756511SDominic Meiser       } else {
21587f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
21597f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
21607f756511SDominic Meiser       }
21617f756511SDominic Meiser     }
216257d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
21637f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
2164c41cb2e2SAlejandro Lamas Daviña     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
21657656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
21667656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
21677f756511SDominic Meiser     delete *matstruct;
21687f756511SDominic Meiser     *matstruct = 0;
21697f756511SDominic Meiser   }
21707f756511SDominic Meiser   PetscFunctionReturn(0);
21717f756511SDominic Meiser }
21727f756511SDominic Meiser 
2173ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
21747f756511SDominic Meiser {
2175e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2176e6e9a74fSStefano Zampini 
21777f756511SDominic Meiser   PetscFunctionBegin;
21787f756511SDominic Meiser   if (*trifactors) {
2179e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
2180e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
2181e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
2182e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
21837f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
21847f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
21857f756511SDominic Meiser     delete (*trifactors)->workVector;
2186ccdfe979SStefano Zampini     (*trifactors)->rpermIndices = 0;
2187ccdfe979SStefano Zampini     (*trifactors)->cpermIndices = 0;
2188ccdfe979SStefano Zampini     (*trifactors)->workVector = 0;
2189ccdfe979SStefano Zampini   }
2190ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2191ccdfe979SStefano Zampini }
2192ccdfe979SStefano Zampini 
2193ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2194ccdfe979SStefano Zampini {
2195e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
2196ccdfe979SStefano Zampini   cusparseHandle_t handle;
2197ccdfe979SStefano Zampini   cusparseStatus_t stat;
2198ccdfe979SStefano Zampini 
2199ccdfe979SStefano Zampini   PetscFunctionBegin;
2200ccdfe979SStefano Zampini   if (*trifactors) {
2201e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
22027f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
220357d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
22047f756511SDominic Meiser     }
2205e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
22067f756511SDominic Meiser   }
22077f756511SDominic Meiser   PetscFunctionReturn(0);
22087f756511SDominic Meiser }
2209